Add files using upload-large-folder tool
This view is limited to 50 files because the commit contains too many changes.
- .gitattributes +1 -0
- GPT_SoVITS/TTS_infer_pack/TTS.py +1463 -0
- GPT_SoVITS/TTS_infer_pack/TextPreprocessor.py +237 -0
- GPT_SoVITS/TTS_infer_pack/__init__.py +1 -0
- GPT_SoVITS/TTS_infer_pack/text_segmentation_method.py +189 -0
- GPT_SoVITS/pretrained_models/chinese-hubert-base/pytorch_model.bin +3 -0
- GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large/pytorch_model.bin +3 -0
- GPT_SoVITS/pretrained_models/fast_langdetect/lid.176.bin +3 -0
- GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt +3 -0
- GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s2D2333k.pth +3 -0
- GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s2G2333k.pth +3 -0
- GPT_SoVITS/pretrained_models/gsv-v4-pretrained/s2Gv4.pth +3 -0
- GPT_SoVITS/pretrained_models/gsv-v4-pretrained/vocoder.pth +3 -0
- GPT_SoVITS/pretrained_models/models--nvidia--bigvgan_v2_24khz_100band_256x/bigvgan_generator.pt +3 -0
- GPT_SoVITS/pretrained_models/s1v3.ckpt +3 -0
- GPT_SoVITS/pretrained_models/s2G488k.pth +3 -0
- GPT_SoVITS/pretrained_models/s2Gv3.pth +3 -0
- GPT_SoVITS/text/G2PWModel/g2pW.onnx +3 -0
- GPT_SoVITS/text/engdict_cache.pickle +3 -0
- GPT_SoVITS/text/g2pw/polyphonic.pickle +3 -0
- GPT_SoVITS/text/ja_userdic/userdict.csv +3 -0
- GPT_SoVITS/text/namedict_cache.pickle +3 -0
- docs/cn/Changelog_CN.md +302 -0
- docs/cn/README.md +395 -0
- docs/en/Changelog_EN.md +222 -0
- docs/ja/Changelog_JA.md +221 -0
- docs/ja/README.md +383 -0
- docs/ko/Changelog_KO.md +222 -0
- docs/ko/README.md +389 -0
- docs/tr/Changelog_TR.md +222 -0
- docs/tr/README.md +385 -0
- tools/AP_BWE_main/24kto48k/readme.txt +11 -0
- tools/AP_BWE_main/LICENSE +21 -0
- tools/AP_BWE_main/README.md +91 -0
- tools/AP_BWE_main/datasets1/__init__.py +1 -0
- tools/AP_BWE_main/datasets1/dataset.py +108 -0
- tools/AP_BWE_main/models/__init__.py +1 -0
- tools/AP_BWE_main/models/model.py +464 -0
- tools/__init__.py +0 -0
- tools/asr/config.py +36 -0
- tools/asr/fasterwhisper_asr.py +129 -0
- tools/asr/funasr_asr.py +118 -0
- tools/asr/models/.gitignore +2 -0
- tools/audio_sr.py +50 -0
- tools/cmd-denoise.py +38 -0
- tools/denoise-model/.gitignore +2 -0
- tools/i18n/i18n.py +41 -0
- tools/i18n/locale/en_US.json +211 -0
- tools/i18n/locale/es_ES.json +211 -0
- tools/i18n/locale/fr_FR.json +211 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+GPT_SoVITS/text/ja_userdic/userdict.csv filter=lfs diff=lfs merge=lfs -text
GPT_SoVITS/TTS_infer_pack/TTS.py
ADDED
@@ -0,0 +1,1463 @@
import gc
import math
import os
import random
import sys
import time
import traceback
from copy import deepcopy

import torchaudio
from tqdm import tqdm

now_dir = os.getcwd()
sys.path.append(now_dir)
import os
from typing import List, Tuple, Union

import ffmpeg
import librosa
import numpy as np
import torch
import torch.nn.functional as F
import yaml
from AR.models.t2s_lightning_module import Text2SemanticLightningModule
from BigVGAN.bigvgan import BigVGAN
from feature_extractor.cnhubert import CNHubert
from module.mel_processing import mel_spectrogram_torch, spectrogram_torch
from module.models import SynthesizerTrn, SynthesizerTrnV3
from peft import LoraConfig, get_peft_model
from process_ckpt import get_sovits_version_from_path_fast, load_sovits_new
from transformers import AutoModelForMaskedLM, AutoTokenizer

from tools.audio_sr import AP_BWE
from tools.i18n.i18n import I18nAuto, scan_language_list
from tools.my_utils import load_audio
from TTS_infer_pack.text_segmentation_method import splits
from TTS_infer_pack.TextPreprocessor import TextPreprocessor

language = os.environ.get("language", "Auto")
language = sys.argv[-1] if sys.argv[-1] in scan_language_list() else language
i18n = I18nAuto(language=language)


spec_min = -12
spec_max = 2


def norm_spec(x):
    return (x - spec_min) / (spec_max - spec_min) * 2 - 1


def denorm_spec(x):
    return (x + 1) / 2 * (spec_max - spec_min) + spec_min


mel_fn = lambda x: mel_spectrogram_torch(
    x,
    **{
        "n_fft": 1024,
        "win_size": 1024,
        "hop_size": 256,
        "num_mels": 100,
        "sampling_rate": 24000,
        "fmin": 0,
        "fmax": None,
        "center": False,
    },
)


def speed_change(input_audio: np.ndarray, speed: float, sr: int):
    # 将 NumPy 数组转换为原始 PCM 流
    raw_audio = input_audio.astype(np.int16).tobytes()

    # 设置 ffmpeg 输入流
    input_stream = ffmpeg.input("pipe:", format="s16le", acodec="pcm_s16le", ar=str(sr), ac=1)

    # 变速处理
    output_stream = input_stream.filter("atempo", speed)

    # 输出流到管道
    out, _ = output_stream.output("pipe:", format="s16le", acodec="pcm_s16le").run(
        input=raw_audio, capture_stdout=True, capture_stderr=True
    )

    # 将管道输出解码为 NumPy 数组
    processed_audio = np.frombuffer(out, np.int16)

    return processed_audio


resample_transform_dict = {}


def resample(audio_tensor, sr0, device):
    global resample_transform_dict
    if sr0 not in resample_transform_dict:
        resample_transform_dict[sr0] = torchaudio.transforms.Resample(sr0, 24000).to(device)
    return resample_transform_dict[sr0](audio_tensor)


class DictToAttrRecursive(dict):
    def __init__(self, input_dict):
        super().__init__(input_dict)
        for key, value in input_dict.items():
            if isinstance(value, dict):
                value = DictToAttrRecursive(value)
            self[key] = value
            setattr(self, key, value)

    def __getattr__(self, item):
        try:
            return self[item]
        except KeyError:
            raise AttributeError(f"Attribute {item} not found")

    def __setattr__(self, key, value):
        if isinstance(value, dict):
            value = DictToAttrRecursive(value)
        super(DictToAttrRecursive, self).__setitem__(key, value)
        super().__setattr__(key, value)

    def __delattr__(self, item):
        try:
            del self[item]
        except KeyError:
            raise AttributeError(f"Attribute {item} not found")


class NO_PROMPT_ERROR(Exception):
    pass


# configs/tts_infer.yaml
"""
custom:
  bert_base_path: GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large
  cnhuhbert_base_path: GPT_SoVITS/pretrained_models/chinese-hubert-base
  device: cpu
  is_half: false
  t2s_weights_path: GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt
  vits_weights_path: GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s2G2333k.pth
  version: v2
default:
  bert_base_path: GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large
  cnhuhbert_base_path: GPT_SoVITS/pretrained_models/chinese-hubert-base
  device: cpu
  is_half: false
  t2s_weights_path: GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt
  vits_weights_path: GPT_SoVITS/pretrained_models/s2G488k.pth
  version: v1
default_v2:
  bert_base_path: GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large
  cnhuhbert_base_path: GPT_SoVITS/pretrained_models/chinese-hubert-base
  device: cpu
  is_half: false
  t2s_weights_path: GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt
  vits_weights_path: GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s2G2333k.pth
  version: v2
default_v3:
  bert_base_path: GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large
  cnhuhbert_base_path: GPT_SoVITS/pretrained_models/chinese-hubert-base
  device: cpu
  is_half: false
  t2s_weights_path: GPT_SoVITS/pretrained_models/s1v3.ckpt
  vits_weights_path: GPT_SoVITS/pretrained_models/s2Gv3.pth
  version: v3
"""


def set_seed(seed: int):
    seed = int(seed)
    seed = seed if seed != -1 else random.randint(0, 2**32 - 1)
    print(f"Set seed to {seed}")
    os.environ["PYTHONHASHSEED"] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    try:
        if torch.cuda.is_available():
            torch.cuda.manual_seed(seed)
            torch.cuda.manual_seed_all(seed)
            # torch.backends.cudnn.deterministic = True
            # torch.backends.cudnn.benchmark = False
            # torch.backends.cudnn.enabled = True
            # 开启后会影响精度
            torch.backends.cuda.matmul.allow_tf32 = False
            torch.backends.cudnn.allow_tf32 = False
    except:
        pass
    return seed


class TTS_Config:
    default_configs = {
        "v1": {
            "device": "cpu",
            "is_half": False,
            "version": "v1",
            "t2s_weights_path": "GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt",
            "vits_weights_path": "GPT_SoVITS/pretrained_models/s2G488k.pth",
            "cnhuhbert_base_path": "GPT_SoVITS/pretrained_models/chinese-hubert-base",
            "bert_base_path": "GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large",
        },
        "v2": {
            "device": "cpu",
            "is_half": False,
            "version": "v2",
            "t2s_weights_path": "GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt",
            "vits_weights_path": "GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s2G2333k.pth",
            "cnhuhbert_base_path": "GPT_SoVITS/pretrained_models/chinese-hubert-base",
            "bert_base_path": "GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large",
        },
        "v3": {
            "device": "cpu",
            "is_half": False,
            "version": "v3",
            "t2s_weights_path": "GPT_SoVITS/pretrained_models/s1v3.ckpt",
            "vits_weights_path": "GPT_SoVITS/pretrained_models/s2Gv3.pth",
            "cnhuhbert_base_path": "GPT_SoVITS/pretrained_models/chinese-hubert-base",
            "bert_base_path": "GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large",
        },
    }
    configs: dict = None
    v1_languages: list = ["auto", "en", "zh", "ja", "all_zh", "all_ja"]
    v2_languages: list = ["auto", "auto_yue", "en", "zh", "ja", "yue", "ko", "all_zh", "all_ja", "all_yue", "all_ko"]
    languages: list = v2_languages
    # "all_zh",#全部按中文识别
    # "en",#全部按英文识别#######不变
    # "all_ja",#全部按日文识别
    # "all_yue",#全部按中文识别
    # "all_ko",#全部按韩文识别
    # "zh",#按中英混合识别####不变
    # "ja",#按日英混合识别####不变
    # "yue",#按粤英混合识别####不变
    # "ko",#按韩英混合识别####不变
    # "auto",#多语种启动切分识别语种
    # "auto_yue",#多语种启动切分识别语种

    def __init__(self, configs: Union[dict, str] = None):
        # 设置默认配置文件路径
        configs_base_path: str = "GPT_SoVITS/configs/"
        os.makedirs(configs_base_path, exist_ok=True)
        self.configs_path: str = os.path.join(configs_base_path, "tts_infer.yaml")

        if configs in ["", None]:
            if not os.path.exists(self.configs_path):
                self.save_configs()
                print(f"Create default config file at {self.configs_path}")
            configs: dict = deepcopy(self.default_configs)

        if isinstance(configs, str):
            self.configs_path = configs
            configs: dict = self._load_configs(self.configs_path)

        assert isinstance(configs, dict)
        version = configs.get("version", "v2").lower()
        assert version in ["v1", "v2", "v3"]
        self.default_configs[version] = configs.get(version, self.default_configs[version])
        self.configs: dict = configs.get("custom", deepcopy(self.default_configs[version]))

        self.device = self.configs.get("device", torch.device("cpu"))
        if "cuda" in str(self.device) and not torch.cuda.is_available():
            print("Warning: CUDA is not available, set device to CPU.")
            self.device = torch.device("cpu")

        self.is_half = self.configs.get("is_half", False)
        # if str(self.device) == "cpu" and self.is_half:
        #     print(f"Warning: Half precision is not supported on CPU, set is_half to False.")
        #     self.is_half = False

        self.version = version
        self.t2s_weights_path = self.configs.get("t2s_weights_path", None)
        self.vits_weights_path = self.configs.get("vits_weights_path", None)
        self.bert_base_path = self.configs.get("bert_base_path", None)
        self.cnhuhbert_base_path = self.configs.get("cnhuhbert_base_path", None)
        self.languages = self.v1_languages if self.version == "v1" else self.v2_languages

        self.is_v3_synthesizer: bool = False

        if (self.t2s_weights_path in [None, ""]) or (not os.path.exists(self.t2s_weights_path)):
            self.t2s_weights_path = self.default_configs[version]["t2s_weights_path"]
            print(f"fall back to default t2s_weights_path: {self.t2s_weights_path}")
        if (self.vits_weights_path in [None, ""]) or (not os.path.exists(self.vits_weights_path)):
            self.vits_weights_path = self.default_configs[version]["vits_weights_path"]
            print(f"fall back to default vits_weights_path: {self.vits_weights_path}")
        if (self.bert_base_path in [None, ""]) or (not os.path.exists(self.bert_base_path)):
            self.bert_base_path = self.default_configs[version]["bert_base_path"]
            print(f"fall back to default bert_base_path: {self.bert_base_path}")
        if (self.cnhuhbert_base_path in [None, ""]) or (not os.path.exists(self.cnhuhbert_base_path)):
            self.cnhuhbert_base_path = self.default_configs[version]["cnhuhbert_base_path"]
            print(f"fall back to default cnhuhbert_base_path: {self.cnhuhbert_base_path}")
        self.update_configs()

        self.max_sec = None
        self.hz: int = 50
        self.semantic_frame_rate: str = "25hz"
        self.segment_size: int = 20480
        self.filter_length: int = 2048
        self.sampling_rate: int = 32000
        self.hop_length: int = 640
        self.win_length: int = 2048
        self.n_speakers: int = 300

    def _load_configs(self, configs_path: str) -> dict:
        if os.path.exists(configs_path):
            ...
        else:
            print(i18n("路径不存在,使用默认配置"))
            self.save_configs(configs_path)
        with open(configs_path, "r", encoding="utf-8") as f:
            configs = yaml.load(f, Loader=yaml.FullLoader)

        return configs

    def save_configs(self, configs_path: str = None) -> None:
        configs = deepcopy(self.default_configs)
        if self.configs is not None:
            configs["custom"] = self.update_configs()

        if configs_path is None:
            configs_path = self.configs_path
        with open(configs_path, "w") as f:
            yaml.dump(configs, f)

    def update_configs(self):
        self.config = {
            "device": str(self.device),
            "is_half": self.is_half,
            "version": self.version,
            "t2s_weights_path": self.t2s_weights_path,
            "vits_weights_path": self.vits_weights_path,
            "bert_base_path": self.bert_base_path,
            "cnhuhbert_base_path": self.cnhuhbert_base_path,
        }
        return self.config

    def update_version(self, version: str) -> None:
        self.version = version
        self.languages = self.v1_languages if self.version == "v1" else self.v2_languages

    def __str__(self):
        self.configs = self.update_configs()
        string = "TTS Config".center(100, "-") + "\n"
        for k, v in self.configs.items():
            string += f"{str(k).ljust(20)}: {str(v)}\n"
        string += "-" * 100 + "\n"
        return string

    def __repr__(self):
        return self.__str__()

    def __hash__(self):
        return hash(self.configs_path)

    def __eq__(self, other):
        return isinstance(other, TTS_Config) and self.configs_path == other.configs_path


class TTS:
    def __init__(self, configs: Union[dict, str, TTS_Config]):
        if isinstance(configs, TTS_Config):
            self.configs = configs
        else:
            self.configs: TTS_Config = TTS_Config(configs)

        self.t2s_model: Text2SemanticLightningModule = None
        self.vits_model: Union[SynthesizerTrn, SynthesizerTrnV3] = None
        self.bert_tokenizer: AutoTokenizer = None
        self.bert_model: AutoModelForMaskedLM = None
        self.cnhuhbert_model: CNHubert = None
        self.bigvgan_model: BigVGAN = None
        self.sr_model: AP_BWE = None
        self.sr_model_not_exist: bool = False

        self._init_models()

        self.text_preprocessor: TextPreprocessor = TextPreprocessor(
            self.bert_model, self.bert_tokenizer, self.configs.device
        )

        self.prompt_cache: dict = {
            "ref_audio_path": None,
            "prompt_semantic": None,
            "refer_spec": [],
            "prompt_text": None,
            "prompt_lang": None,
            "phones": None,
            "bert_features": None,
            "norm_text": None,
            "aux_ref_audio_paths": [],
        }

        self.stop_flag: bool = False
        self.precision: torch.dtype = torch.float16 if self.configs.is_half else torch.float32

    def _init_models(
        self,
    ):
        self.init_t2s_weights(self.configs.t2s_weights_path)
        self.init_vits_weights(self.configs.vits_weights_path)
        self.init_bert_weights(self.configs.bert_base_path)
        self.init_cnhuhbert_weights(self.configs.cnhuhbert_base_path)
        # self.enable_half_precision(self.configs.is_half)

    def init_cnhuhbert_weights(self, base_path: str):
        print(f"Loading CNHuBERT weights from {base_path}")
        self.cnhuhbert_model = CNHubert(base_path)
        self.cnhuhbert_model = self.cnhuhbert_model.eval()
        self.cnhuhbert_model = self.cnhuhbert_model.to(self.configs.device)
        if self.configs.is_half and str(self.configs.device) != "cpu":
            self.cnhuhbert_model = self.cnhuhbert_model.half()

    def init_bert_weights(self, base_path: str):
        print(f"Loading BERT weights from {base_path}")
        self.bert_tokenizer = AutoTokenizer.from_pretrained(base_path)
        self.bert_model = AutoModelForMaskedLM.from_pretrained(base_path)
        self.bert_model = self.bert_model.eval()
        self.bert_model = self.bert_model.to(self.configs.device)
        if self.configs.is_half and str(self.configs.device) != "cpu":
            self.bert_model = self.bert_model.half()

    def init_vits_weights(self, weights_path: str):
        self.configs.vits_weights_path = weights_path
        version, model_version, if_lora_v3 = get_sovits_version_from_path_fast(weights_path)
        path_sovits_v3 = self.configs.default_configs["v3"]["vits_weights_path"]

        if if_lora_v3 == True and os.path.exists(path_sovits_v3) == False:
            info = path_sovits_v3 + i18n("SoVITS V3 底模缺失,无法加载相应 LoRA 权重")
            raise FileExistsError(info)

        # dict_s2 = torch.load(weights_path, map_location=self.configs.device,weights_only=False)
        dict_s2 = load_sovits_new(weights_path)
        hps = dict_s2["config"]

        hps["model"]["semantic_frame_rate"] = "25hz"
        if "enc_p.text_embedding.weight" not in dict_s2["weight"]:
            hps["model"]["version"] = "v2"  # v3model,v2sybomls
        elif dict_s2["weight"]["enc_p.text_embedding.weight"].shape[0] == 322:
            hps["model"]["version"] = "v1"
        else:
            hps["model"]["version"] = "v2"
        # version = hps["model"]["version"]

        self.configs.filter_length = hps["data"]["filter_length"]
        self.configs.segment_size = hps["train"]["segment_size"]
        self.configs.sampling_rate = hps["data"]["sampling_rate"]
        self.configs.hop_length = hps["data"]["hop_length"]
        self.configs.win_length = hps["data"]["win_length"]
        self.configs.n_speakers = hps["data"]["n_speakers"]
        self.configs.semantic_frame_rate = hps["model"]["semantic_frame_rate"]
        kwargs = hps["model"]
        # print(f"self.configs.sampling_rate:{self.configs.sampling_rate}")

        self.configs.update_version(model_version)

        # print(f"model_version:{model_version}")
        # print(f'hps["model"]["version"]:{hps["model"]["version"]}')
        if model_version != "v3":
            vits_model = SynthesizerTrn(
                self.configs.filter_length // 2 + 1,
                self.configs.segment_size // self.configs.hop_length,
                n_speakers=self.configs.n_speakers,
                **kwargs,
            )
            self.configs.is_v3_synthesizer = False
        else:
            vits_model = SynthesizerTrnV3(
                self.configs.filter_length // 2 + 1,
                self.configs.segment_size // self.configs.hop_length,
                n_speakers=self.configs.n_speakers,
                **kwargs,
            )
            self.configs.is_v3_synthesizer = True
            self.init_bigvgan()
        if "pretrained" not in weights_path and hasattr(vits_model, "enc_q"):
            del vits_model.enc_q

        if if_lora_v3 == False:
            print(
                f"Loading VITS weights from {weights_path}. {vits_model.load_state_dict(dict_s2['weight'], strict=False)}"
            )
        else:
            print(
                f"Loading VITS pretrained weights from {weights_path}. {vits_model.load_state_dict(load_sovits_new(path_sovits_v3)['weight'], strict=False)}"
            )
            lora_rank = dict_s2["lora_rank"]
            lora_config = LoraConfig(
                target_modules=["to_k", "to_q", "to_v", "to_out.0"],
                r=lora_rank,
                lora_alpha=lora_rank,
                init_lora_weights=True,
            )
            vits_model.cfm = get_peft_model(vits_model.cfm, lora_config)
            print(
                f"Loading LoRA weights from {weights_path}. {vits_model.load_state_dict(dict_s2['weight'], strict=False)}"
            )

            vits_model.cfm = vits_model.cfm.merge_and_unload()

        vits_model = vits_model.to(self.configs.device)
        vits_model = vits_model.eval()

        self.vits_model = vits_model
        if self.configs.is_half and str(self.configs.device) != "cpu":
            self.vits_model = self.vits_model.half()

    def init_t2s_weights(self, weights_path: str):
        print(f"Loading Text2Semantic weights from {weights_path}")
        self.configs.t2s_weights_path = weights_path
        self.configs.save_configs()
        self.configs.hz = 50
        dict_s1 = torch.load(weights_path, map_location=self.configs.device)
        config = dict_s1["config"]
        self.configs.max_sec = config["data"]["max_sec"]
        t2s_model = Text2SemanticLightningModule(config, "****", is_train=False)
        t2s_model.load_state_dict(dict_s1["weight"])
        t2s_model = t2s_model.to(self.configs.device)
        t2s_model = t2s_model.eval()
        self.t2s_model = t2s_model
        if self.configs.is_half and str(self.configs.device) != "cpu":
            self.t2s_model = self.t2s_model.half()

    def init_bigvgan(self):
        if self.bigvgan_model is not None:
            return
        self.bigvgan_model = BigVGAN.from_pretrained(
            "%s/GPT_SoVITS/pretrained_models/models--nvidia--bigvgan_v2_24khz_100band_256x" % (now_dir,),
            use_cuda_kernel=False,
        )  # if True, RuntimeError: Ninja is required to load C++ extensions
        # remove weight norm in the model and set to eval mode
        self.bigvgan_model.remove_weight_norm()
        self.bigvgan_model = self.bigvgan_model.eval()
        if self.configs.is_half == True:
            self.bigvgan_model = self.bigvgan_model.half().to(self.configs.device)
        else:
            self.bigvgan_model = self.bigvgan_model.to(self.configs.device)

    def init_sr_model(self):
        if self.sr_model is not None:
            return
        try:
            self.sr_model: AP_BWE = AP_BWE(self.configs.device, DictToAttrRecursive)
            self.sr_model_not_exist = False
        except FileNotFoundError:
            print(i18n("你没有下载超分模型的参数,因此不进行超分。如想超分请先参照教程把文件下载好"))
            self.sr_model_not_exist = True

    def enable_half_precision(self, enable: bool = True, save: bool = True):
        """
        To enable half precision for the TTS model.
        Args:
            enable: bool, whether to enable half precision.

        """
        if str(self.configs.device) == "cpu" and enable:
            print("Half precision is not supported on CPU.")
            return

        self.configs.is_half = enable
        self.precision = torch.float16 if enable else torch.float32
        if save:
            self.configs.save_configs()
        if enable:
            if self.t2s_model is not None:
                self.t2s_model = self.t2s_model.half()
            if self.vits_model is not None:
                self.vits_model = self.vits_model.half()
            if self.bert_model is not None:
                self.bert_model = self.bert_model.half()
            if self.cnhuhbert_model is not None:
                self.cnhuhbert_model = self.cnhuhbert_model.half()
            if self.bigvgan_model is not None:
                self.bigvgan_model = self.bigvgan_model.half()
        else:
            if self.t2s_model is not None:
                self.t2s_model = self.t2s_model.float()
            if self.vits_model is not None:
                self.vits_model = self.vits_model.float()
            if self.bert_model is not None:
                self.bert_model = self.bert_model.float()
            if self.cnhuhbert_model is not None:
                self.cnhuhbert_model = self.cnhuhbert_model.float()
            if self.bigvgan_model is not None:
                self.bigvgan_model = self.bigvgan_model.float()

    def set_device(self, device: torch.device, save: bool = True):
        """
        To set the device for all models.
        Args:
            device: torch.device, the device to use for all models.
        """
        self.configs.device = device
        if save:
            self.configs.save_configs()
        if self.t2s_model is not None:
            self.t2s_model = self.t2s_model.to(device)
        if self.vits_model is not None:
            self.vits_model = self.vits_model.to(device)
        if self.bert_model is not None:
            self.bert_model = self.bert_model.to(device)
        if self.cnhuhbert_model is not None:
            self.cnhuhbert_model = self.cnhuhbert_model.to(device)
        if self.bigvgan_model is not None:
            self.bigvgan_model = self.bigvgan_model.to(device)
        if self.sr_model is not None:
            self.sr_model = self.sr_model.to(device)

    def set_ref_audio(self, ref_audio_path: str):
        """
        To set the reference audio for the TTS model,
            including the prompt_semantic and refer_spepc.
        Args:
            ref_audio_path: str, the path of the reference audio.
        """
        self._set_prompt_semantic(ref_audio_path)
        self._set_ref_spec(ref_audio_path)
        self._set_ref_audio_path(ref_audio_path)

    def _set_ref_audio_path(self, ref_audio_path):
        self.prompt_cache["ref_audio_path"] = ref_audio_path

    def _set_ref_spec(self, ref_audio_path):
        spec = self._get_ref_spec(ref_audio_path)
        if self.prompt_cache["refer_spec"] in [[], None]:
            self.prompt_cache["refer_spec"] = [spec]
        else:
            self.prompt_cache["refer_spec"][0] = spec

    def _get_ref_spec(self, ref_audio_path):
        raw_audio, raw_sr = torchaudio.load(ref_audio_path)
        raw_audio = raw_audio.to(self.configs.device).float()
        self.prompt_cache["raw_audio"] = raw_audio
        self.prompt_cache["raw_sr"] = raw_sr

        audio = load_audio(ref_audio_path, int(self.configs.sampling_rate))
        audio = torch.FloatTensor(audio)
        maxx = audio.abs().max()
        if maxx > 1:
            audio /= min(2, maxx)
        audio_norm = audio
        audio_norm = audio_norm.unsqueeze(0)
        spec = spectrogram_torch(
            audio_norm,
            self.configs.filter_length,
            self.configs.sampling_rate,
            self.configs.hop_length,
            self.configs.win_length,
            center=False,
        )
        spec = spec.to(self.configs.device)
        if self.configs.is_half:
            spec = spec.half()
        return spec

    def _set_prompt_semantic(self, ref_wav_path: str):
        zero_wav = np.zeros(
            int(self.configs.sampling_rate * 0.3),
            dtype=np.float16 if self.configs.is_half else np.float32,
        )
        with torch.no_grad():
            wav16k, sr = librosa.load(ref_wav_path, sr=16000)
            if wav16k.shape[0] > 160000 or wav16k.shape[0] < 48000:
                raise OSError(i18n("参考音频在3~10秒范围外,请更换!"))
            wav16k = torch.from_numpy(wav16k)
            zero_wav_torch = torch.from_numpy(zero_wav)
            wav16k = wav16k.to(self.configs.device)
            zero_wav_torch = zero_wav_torch.to(self.configs.device)
            if self.configs.is_half:
                wav16k = wav16k.half()
                zero_wav_torch = zero_wav_torch.half()

            wav16k = torch.cat([wav16k, zero_wav_torch])
            hubert_feature = self.cnhuhbert_model.model(wav16k.unsqueeze(0))["last_hidden_state"].transpose(
                1, 2
            )  # .float()
            codes = self.vits_model.extract_latent(hubert_feature)

            prompt_semantic = codes[0, 0].to(self.configs.device)
            self.prompt_cache["prompt_semantic"] = prompt_semantic

    def batch_sequences(self, sequences: List[torch.Tensor], axis: int = 0, pad_value: int = 0, max_length: int = None):
        seq = sequences[0]
        ndim = seq.dim()
        if axis < 0:
            axis += ndim
        dtype: torch.dtype = seq.dtype
        pad_value = torch.tensor(pad_value, dtype=dtype)
        seq_lengths = [seq.shape[axis] for seq in sequences]
        if max_length is None:
            max_length = max(seq_lengths)
        else:
            max_length = max(seq_lengths) if max_length < max(seq_lengths) else max_length

        padded_sequences = []
        for seq, length in zip(sequences, seq_lengths):
            padding = [0] * axis + [0, max_length - length] + [0] * (ndim - axis - 1)
            padded_seq = torch.nn.functional.pad(seq, padding, value=pad_value)
            padded_sequences.append(padded_seq)
        batch = torch.stack(padded_sequences)
        return batch

    def to_batch(
        self,
        data: list,
        prompt_data: dict = None,
        batch_size: int = 5,
        threshold: float = 0.75,
        split_bucket: bool = True,
        device: torch.device = torch.device("cpu"),
        precision: torch.dtype = torch.float32,
    ):
        _data: list = []
        index_and_len_list = []
        for idx, item in enumerate(data):
            norm_text_len = len(item["norm_text"])
            index_and_len_list.append([idx, norm_text_len])

        batch_index_list = []
        if split_bucket:
            index_and_len_list.sort(key=lambda x: x[1])
            index_and_len_list = np.array(index_and_len_list, dtype=np.int64)

            batch_index_list_len = 0
            pos = 0
            while pos < index_and_len_list.shape[0]:
                # batch_index_list.append(index_and_len_list[pos:min(pos+batch_size,len(index_and_len_list))])
                pos_end = min(pos + batch_size, index_and_len_list.shape[0])
                while pos < pos_end:
                    batch = index_and_len_list[pos:pos_end, 1].astype(np.float32)
                    score = batch[(pos_end - pos) // 2] / (batch.mean() + 1e-8)
                    if (score >= threshold) or (pos_end - pos == 1):
                        batch_index = index_and_len_list[pos:pos_end, 0].tolist()
                        batch_index_list_len += len(batch_index)
                        batch_index_list.append(batch_index)
                        pos = pos_end
                        break
                    pos_end = pos_end - 1

            assert batch_index_list_len == len(data)

        else:
            for i in range(len(data)):
                if i % batch_size == 0:
                    batch_index_list.append([])
                batch_index_list[-1].append(i)

        for batch_idx, index_list in enumerate(batch_index_list):
            item_list = [data[idx] for idx in index_list]
            phones_list = []
            phones_len_list = []
            # bert_features_list = []
            all_phones_list = []
            all_phones_len_list = []
            all_bert_features_list = []
            norm_text_batch = []
            all_bert_max_len = 0
            all_phones_max_len = 0
            for item in item_list:
                if prompt_data is not None:
                    all_bert_features = torch.cat([prompt_data["bert_features"], item["bert_features"]], 1).to(
                        dtype=precision, device=device
                    )
                    all_phones = torch.LongTensor(prompt_data["phones"] + item["phones"]).to(device)
                    phones = torch.LongTensor(item["phones"]).to(device)
                    # norm_text = prompt_data["norm_text"]+item["norm_text"]
                else:
                    all_bert_features = item["bert_features"].to(dtype=precision, device=device)
                    phones = torch.LongTensor(item["phones"]).to(device)
                    all_phones = phones
                    # norm_text = item["norm_text"]

                all_bert_max_len = max(all_bert_max_len, all_bert_features.shape[-1])
                all_phones_max_len = max(all_phones_max_len, all_phones.shape[-1])

                phones_list.append(phones)
                phones_len_list.append(phones.shape[-1])
                all_phones_list.append(all_phones)
                all_phones_len_list.append(all_phones.shape[-1])
                all_bert_features_list.append(all_bert_features)
                norm_text_batch.append(item["norm_text"])

            phones_batch = phones_list
            all_phones_batch = all_phones_list
            all_bert_features_batch = all_bert_features_list

            max_len = max(all_bert_max_len, all_phones_max_len)
            # phones_batch = self.batch_sequences(phones_list, axis=0, pad_value=0, max_length=max_len)
            #### 直接对phones和bert_features进行pad。(padding策略会影响T2S模型生成的结果,但不直接影响复读概率。影响复读概率的主要因素是mask的策略)
            # all_phones_batch = self.batch_sequences(all_phones_list, axis=0, pad_value=0, max_length=max_len)
            # all_bert_features_batch = all_bert_features_list
            # all_bert_features_batch = torch.zeros((len(all_bert_features_list), 1024, max_len), dtype=precision, device=device)
            # for idx, item in enumerate(all_bert_features_list):
            #     all_bert_features_batch[idx, :, : item.shape[-1]] = item

            # #### 先对phones进行embedding、对bert_features进行project,再pad到相同长度,(padding策略会影响T2S模型生成的结果,但不直接影响复读概率。影响复读概率的主要因素是mask的策略)
            # all_phones_list = [self.t2s_model.model.ar_text_embedding(item.to(self.t2s_model.device)) for item in all_phones_list]
            # all_phones_list = [F.pad(item,(0,0,0,max_len-item.shape[0]),value=0) for item in all_phones_list]
            # all_phones_batch = torch.stack(all_phones_list, dim=0)

            # all_bert_features_list = [self.t2s_model.model.bert_proj(item.to(self.t2s_model.device).transpose(0, 1)) for item in all_bert_features_list]
            # all_bert_features_list = [F.pad(item,(0,0,0,max_len-item.shape[0]), value=0) for item in all_bert_features_list]
            # all_bert_features_batch = torch.stack(all_bert_features_list, dim=0)

            batch = {
                "phones": phones_batch,
                "phones_len": torch.LongTensor(phones_len_list).to(device),
                "all_phones": all_phones_batch,
                "all_phones_len": torch.LongTensor(all_phones_len_list).to(device),
                "all_bert_features": all_bert_features_batch,
                "norm_text": norm_text_batch,
                "max_len": max_len,
            }
            _data.append(batch)

        return _data, batch_index_list

    def recovery_order(self, data: list, batch_index_list: list) -> list:
        """
        Recovery the order of the audio according to the batch_index_list.

        Args:
            data (List[list(torch.Tensor)]): the out of order audio .
            batch_index_list (List[list[int]]): the batch index list.

        Returns:
            list (List[torch.Tensor]): the data in the original order.
        """
        length = len(sum(batch_index_list, []))
        _data = [None] * length
        for i, index_list in enumerate(batch_index_list):
            for j, index in enumerate(index_list):
                _data[index] = data[i][j]
        return _data

    def stop(
        self,
    ):
        """
        Stop the inference process.
        """
        self.stop_flag = True

    @torch.no_grad()
    def run(self, inputs: dict):
        """
        Text to speech inference.

        Args:
            inputs (dict):
                {
                    "text": "", # str.(required) text to be synthesized
                    "text_lang: "", # str.(required) language of the text to be synthesized
                    "ref_audio_path": "", # str.(required) reference audio path
                    "aux_ref_audio_paths": [], # list.(optional) auxiliary reference audio paths for multi-speaker tone fusion
                    "prompt_text": "", # str.(optional) prompt text for the reference audio
                    "prompt_lang": "", # str.(required) language of the prompt text for the reference audio
                    "top_k": 5, # int. top k sampling
                    "top_p": 1, # float. top p sampling
                    "temperature": 1, # float. temperature for sampling
                    "text_split_method": "cut0", # str. text split method, see text_segmentation_method.py for details.
                    "batch_size": 1, # int. batch size for inference
                    "batch_threshold": 0.75, # float. threshold for batch splitting.
                    "split_bucket: True, # bool. whether to split the batch into multiple buckets.
                    "return_fragment": False, # bool. step by step return the audio fragment.
                    "speed_factor":1.0, # float. control the speed of the synthesized audio.
                    "fragment_interval":0.3, # float. to control the interval of the audio fragment.
                    "seed": -1, # int. random seed for reproducibility.
                    "parallel_infer": True, # bool. whether to use parallel inference.
                    "repetition_penalty": 1.35 # float. repetition penalty for T2S model.
                    "sample_steps": 32, # int. number of sampling steps for VITS model V3.
                    "super_sampling": False, # bool. whether to use super-sampling for audio when using VITS model V3.
                }
        returns:
            Tuple[int, np.ndarray]: sampling rate and audio data.
        """
        ########## variables initialization ###########
        self.stop_flag: bool = False
        text: str = inputs.get("text", "")
        text_lang: str = inputs.get("text_lang", "")
        ref_audio_path: str = inputs.get("ref_audio_path", "")
        aux_ref_audio_paths: list = inputs.get("aux_ref_audio_paths", [])
        prompt_text: str = inputs.get("prompt_text", "")
        prompt_lang: str = inputs.get("prompt_lang", "")
        top_k: int = inputs.get("top_k", 5)
        top_p: float = inputs.get("top_p", 1)
        temperature: float = inputs.get("temperature", 1)
        text_split_method: str = inputs.get("text_split_method", "cut0")
        batch_size = inputs.get("batch_size", 1)
        batch_threshold = inputs.get("batch_threshold", 0.75)
        speed_factor = inputs.get("speed_factor", 1.0)
        split_bucket = inputs.get("split_bucket", True)
        return_fragment = inputs.get("return_fragment", False)
        fragment_interval = inputs.get("fragment_interval", 0.3)
        seed = inputs.get("seed", -1)
        seed = -1 if seed in ["", None] else seed
        actual_seed = set_seed(seed)
        parallel_infer = inputs.get("parallel_infer", True)
        repetition_penalty = inputs.get("repetition_penalty", 1.35)
        sample_steps = inputs.get("sample_steps", 32)
        super_sampling = inputs.get("super_sampling", False)

        if parallel_infer:
            print(i18n("并行推理模式已开启"))
            self.t2s_model.model.infer_panel = self.t2s_model.model.infer_panel_batch_infer
        else:
            print(i18n("并行推理模式已关闭"))
            self.t2s_model.model.infer_panel = self.t2s_model.model.infer_panel_naive_batched

        if return_fragment:
            print(i18n("分段返回模式已开启"))
            if split_bucket:
                split_bucket = False
                print(i18n("分段返回模式不支持分桶处理,已自动关闭分桶处理"))

        if split_bucket and speed_factor == 1.0 and not (self.configs.is_v3_synthesizer and parallel_infer):
            print(i18n("分桶处理模式已开启"))
        elif speed_factor != 1.0:
            print(i18n("语速调节不支持分桶处理,已自动关闭分桶处理"))
            split_bucket = False
        elif self.configs.is_v3_synthesizer and parallel_infer:
            print(i18n("当开启并行推理模式时,SoVits V3模型不支持分桶处理,已自动关闭分桶处理"))
            split_bucket = False
        else:
            print(i18n("分桶处理模式已关闭"))

        if fragment_interval < 0.01:
            fragment_interval = 0.01
            print(i18n("分段间隔过小,已自动设置为0.01"))

        no_prompt_text = False
        if prompt_text in [None, ""]:
            no_prompt_text = True

        assert text_lang in self.configs.languages
        if not no_prompt_text:
            assert prompt_lang in self.configs.languages

        if no_prompt_text and self.configs.is_v3_synthesizer:
            raise NO_PROMPT_ERROR("prompt_text cannot be empty when using SoVITS_V3")

        if ref_audio_path in [None, ""] and (
            (self.prompt_cache["prompt_semantic"] is None) or (self.prompt_cache["refer_spec"] in [None, []])
        ):
            raise ValueError(
                "ref_audio_path cannot be empty, when the reference audio is not set using set_ref_audio()"
            )

        ###### setting reference audio and prompt text preprocessing ########
        t0 = time.perf_counter()
        if (ref_audio_path is not None) and (ref_audio_path != self.prompt_cache["ref_audio_path"]):
            if not os.path.exists(ref_audio_path):
                raise ValueError(f"{ref_audio_path} not exists")
            self.set_ref_audio(ref_audio_path)

        aux_ref_audio_paths = aux_ref_audio_paths if aux_ref_audio_paths is not None else []
        paths = set(aux_ref_audio_paths) & set(self.prompt_cache["aux_ref_audio_paths"])
        if not (len(list(paths)) == len(aux_ref_audio_paths) == len(self.prompt_cache["aux_ref_audio_paths"])):
            self.prompt_cache["aux_ref_audio_paths"] = aux_ref_audio_paths
            self.prompt_cache["refer_spec"] = [self.prompt_cache["refer_spec"][0]]
            for path in aux_ref_audio_paths:
                if path in [None, ""]:
                    continue
                if not os.path.exists(path):
                    print(i18n("音频文件不存在,跳过:"), path)
                    continue
                self.prompt_cache["refer_spec"].append(self._get_ref_spec(path))

        if not no_prompt_text:
            prompt_text = prompt_text.strip("\n")
            if prompt_text[-1] not in splits:
                prompt_text += "。" if prompt_lang != "en" else "."
            print(i18n("实际输入的参考文本:"), prompt_text)
            if self.prompt_cache["prompt_text"] != prompt_text:
                phones, bert_features, norm_text = self.text_preprocessor.segment_and_extract_feature_for_text(
                    prompt_text, prompt_lang, self.configs.version
                )
                self.prompt_cache["prompt_text"] = prompt_text
                self.prompt_cache["prompt_lang"] = prompt_lang
                self.prompt_cache["phones"] = phones
                self.prompt_cache["bert_features"] = bert_features
                self.prompt_cache["norm_text"] = norm_text

        ###### text preprocessing ########
        t1 = time.perf_counter()
        data: list = None
        if not return_fragment:
            data = self.text_preprocessor.preprocess(text, text_lang, text_split_method, self.configs.version)
            if len(data) == 0:
                yield 16000, np.zeros(int(16000), dtype=np.int16)
                return

            batch_index_list: list = None
            data, batch_index_list = self.to_batch(
                data,
                prompt_data=self.prompt_cache if not no_prompt_text else None,
                batch_size=batch_size,
                threshold=batch_threshold,
                split_bucket=split_bucket,
                device=self.configs.device,
                precision=self.precision,
            )
        else:
            print(f"############ {i18n('切分文本')} ############")
            texts = self.text_preprocessor.pre_seg_text(text, text_lang, text_split_method)
            data = []
            for i in range(len(texts)):
                if i % batch_size == 0:
                    data.append([])
                data[-1].append(texts[i])

        def make_batch(batch_texts):
            batch_data = []
            print(f"############ {i18n('提取文本Bert特征')} ############")
            for text in tqdm(batch_texts):
                phones, bert_features, norm_text = self.text_preprocessor.segment_and_extract_feature_for_text(
                    text, text_lang, self.configs.version
                )
                if phones is None:
                    continue
                res = {
                    "phones": phones,
                    "bert_features": bert_features,
                    "norm_text": norm_text,
                }
                batch_data.append(res)
            if len(batch_data) == 0:
                return None
            batch, _ = self.to_batch(
                batch_data,
                prompt_data=self.prompt_cache if not no_prompt_text else None,
                batch_size=batch_size,
                threshold=batch_threshold,
                split_bucket=False,
                device=self.configs.device,
                precision=self.precision,
            )
            return batch[0]

        t2 = time.perf_counter()
        try:
            print("############ 推理 ############")
            ###### inference ######
            t_34 = 0.0
            t_45 = 0.0
            audio = []
            output_sr = self.configs.sampling_rate if not self.configs.is_v3_synthesizer else 24000
            for item in data:
                t3 = time.perf_counter()
                if return_fragment:
                    item = make_batch(item)
                    if item is None:
                        continue

                batch_phones: List[torch.LongTensor] = item["phones"]
                # batch_phones:torch.LongTensor = item["phones"]
                batch_phones_len: torch.LongTensor = item["phones_len"]
                all_phoneme_ids: torch.LongTensor = item["all_phones"]
                all_phoneme_lens: torch.LongTensor = item["all_phones_len"]
                all_bert_features: torch.LongTensor = item["all_bert_features"]
                norm_text: str = item["norm_text"]
                max_len = item["max_len"]

                print(i18n("前端处理后的文本(每句):"), norm_text)
                if no_prompt_text:
                    prompt = None
                else:
                    prompt = (
                        self.prompt_cache["prompt_semantic"].expand(len(all_phoneme_ids), -1).to(self.configs.device)
                    )

                print(f"############ {i18n('预测语义Token')} ############")
                pred_semantic_list, idx_list = self.t2s_model.model.infer_panel(
                    all_phoneme_ids,
                    all_phoneme_lens,
                    prompt,
                    all_bert_features,
                    # prompt_phone_len=ph_offset,
                    top_k=top_k,
                    top_p=top_p,
                    temperature=temperature,
                    early_stop_num=self.configs.hz * self.configs.max_sec,
                    max_len=max_len,
                    repetition_penalty=repetition_penalty,
                )
                t4 = time.perf_counter()
                t_34 += t4 - t3

                refer_audio_spec: torch.Tensor = [
                    item.to(dtype=self.precision, device=self.configs.device)
                    for item in self.prompt_cache["refer_spec"]
                ]

                batch_audio_fragment = []

                # ## vits并行推理 method 1
                # pred_semantic_list = [item[-idx:] for item, idx in zip(pred_semantic_list, idx_list)]
                # pred_semantic_len = torch.LongTensor([item.shape[0] for item in pred_semantic_list]).to(self.configs.device)
                # pred_semantic = self.batch_sequences(pred_semantic_list, axis=0, pad_value=0).unsqueeze(0)
                # max_len = 0
                # for i in range(0, len(batch_phones)):
                #     max_len = max(max_len, batch_phones[i].shape[-1])
                # batch_phones = self.batch_sequences(batch_phones, axis=0, pad_value=0, max_length=max_len)
                # batch_phones = batch_phones.to(self.configs.device)
                # batch_audio_fragment = (self.vits_model.batched_decode(
                #     pred_semantic, pred_semantic_len, batch_phones, batch_phones_len,refer_audio_spec
                # ))
                print(f"############ {i18n('合成音频')} ############")
                if not self.configs.is_v3_synthesizer:
                    if speed_factor == 1.0:
                        print(f"{i18n('并行合成中')}...")
                        # ## vits并行推理 method 2
                        pred_semantic_list = [item[-idx:] for item, idx in zip(pred_semantic_list, idx_list)]
                        upsample_rate = math.prod(self.vits_model.upsample_rates)
                        audio_frag_idx = [
                            pred_semantic_list[i].shape[0] * 2 * upsample_rate
                            for i in range(0, len(pred_semantic_list))
                        ]
                        audio_frag_end_idx = [sum(audio_frag_idx[: i + 1]) for i in range(0, len(audio_frag_idx))]
                        all_pred_semantic = (
                            torch.cat(pred_semantic_list).unsqueeze(0).unsqueeze(0).to(self.configs.device)
                        )
                        _batch_phones = torch.cat(batch_phones).unsqueeze(0).to(self.configs.device)
                        _batch_audio_fragment = self.vits_model.decode(
                            all_pred_semantic, _batch_phones, refer_audio_spec, speed=speed_factor
                        ).detach()[0, 0, :]
                        audio_frag_end_idx.insert(0, 0)
                        batch_audio_fragment = [
                            _batch_audio_fragment[audio_frag_end_idx[i - 1] : audio_frag_end_idx[i]]
                            for i in range(1, len(audio_frag_end_idx))
                        ]
                    else:
                        # ## vits串行推理
                        for i, idx in enumerate(tqdm(idx_list)):
                            phones = batch_phones[i].unsqueeze(0).to(self.configs.device)
                            _pred_semantic = (
                                pred_semantic_list[i][-idx:].unsqueeze(0).unsqueeze(0)
                            )  # .unsqueeze(0)#mq要多unsqueeze一次
                            audio_fragment = self.vits_model.decode(
                                _pred_semantic, phones, refer_audio_spec, speed=speed_factor
                            ).detach()[0, 0, :]
                            batch_audio_fragment.append(audio_fragment)  ###试试重建不带上prompt部分
                else:
                    if parallel_infer:
                        print(f"{i18n('并行合成中')}...")
                        audio_fragments = self.v3_synthesis_batched_infer(
                            idx_list, pred_semantic_list, batch_phones, speed=speed_factor, sample_steps=sample_steps
                        )
                        batch_audio_fragment.extend(audio_fragments)
                    else:
                        for i, idx in enumerate(tqdm(idx_list)):
                            phones = batch_phones[i].unsqueeze(0).to(self.configs.device)
                            _pred_semantic = (
                                pred_semantic_list[i][-idx:].unsqueeze(0).unsqueeze(0)
                            )  # .unsqueeze(0)#mq要多unsqueeze一次
                            audio_fragment = self.v3_synthesis(
                                _pred_semantic, phones, speed=speed_factor, sample_steps=sample_steps
                            )
                            batch_audio_fragment.append(audio_fragment)

                t5 = time.perf_counter()
                t_45 += t5 - t4
                if return_fragment:
                    print("%.3f\t%.3f\t%.3f\t%.3f" % (t1 - t0, t2 - t1, t4 - t3, t5 - t4))
                    yield self.audio_postprocess(
                        [batch_audio_fragment],
                        output_sr,
                        None,
                        speed_factor,
                        False,
                        fragment_interval,
                        super_sampling if self.configs.is_v3_synthesizer else False,
                    )
                else:
                    audio.append(batch_audio_fragment)

                if self.stop_flag:
                    yield 16000, np.zeros(int(16000), dtype=np.int16)
                    return

            if not return_fragment:
                print("%.3f\t%.3f\t%.3f\t%.3f" % (t1 - t0, t2 - t1, t_34, t_45))
                if len(audio) == 0:
                    yield 16000, np.zeros(int(16000), dtype=np.int16)
|
1185 |
+
return
|
1186 |
+
yield self.audio_postprocess(
|
1187 |
+
audio,
|
1188 |
+
output_sr,
|
1189 |
+
batch_index_list,
|
1190 |
+
speed_factor,
|
1191 |
+
split_bucket,
|
1192 |
+
fragment_interval,
|
1193 |
+
super_sampling if self.configs.is_v3_synthesizer else False,
|
1194 |
+
)
|
1195 |
+
|
1196 |
+
except Exception as e:
|
1197 |
+
traceback.print_exc()
|
1198 |
+
# 必须返回一个空音频, 否则会导致显存不释放。
|
1199 |
+
yield 16000, np.zeros(int(16000), dtype=np.int16)
|
1200 |
+
# 重置模型, 否则会导致显存释放不完全。
|
1201 |
+
del self.t2s_model
|
1202 |
+
del self.vits_model
|
1203 |
+
self.t2s_model = None
|
1204 |
+
self.vits_model = None
|
1205 |
+
self.init_t2s_weights(self.configs.t2s_weights_path)
|
1206 |
+
self.init_vits_weights(self.configs.vits_weights_path)
|
1207 |
+
raise e
|
1208 |
+
finally:
|
1209 |
+
self.empty_cache()
|
1210 |
+
|
1211 |
+
def empty_cache(self):
|
1212 |
+
try:
|
1213 |
+
gc.collect() # 触发gc的垃圾回收。避免内存一直增长。
|
1214 |
+
if "cuda" in str(self.configs.device):
|
1215 |
+
torch.cuda.empty_cache()
|
1216 |
+
elif str(self.configs.device) == "mps":
|
1217 |
+
torch.mps.empty_cache()
|
1218 |
+
except:
|
1219 |
+
pass
|
1220 |
+
|
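The fragment-streaming path above yields one `(sample_rate, int16 ndarray)` tuple per sentence batch when `return_fragment` is set. Below is a minimal consumer sketch; the assumption that `run()` accepts a request dict containing a `return_fragment` key is illustrative, only the shape of the yielded tuples comes from the code above.

```python
import numpy as np

def synthesize_streaming(tts_pipeline, request: dict):
    # Assumption for illustration: run() takes a dict of inference parameters.
    # return_fragment=True makes it yield one (sample_rate, int16 ndarray) per sentence batch.
    request = dict(request, return_fragment=True)
    sr, chunks = None, []
    for sr, fragment in tts_pipeline.run(request):
        chunks.append(fragment)  # stream each fragment to a player/socket here for low latency
    if not chunks:
        return sr, np.zeros(0, dtype=np.int16)
    return sr, np.concatenate(chunks)
```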
    def audio_postprocess(
        self,
        audio: List[torch.Tensor],
        sr: int,
        batch_index_list: list = None,
        speed_factor: float = 1.0,
        split_bucket: bool = True,
        fragment_interval: float = 0.3,
        super_sampling: bool = False,
    ) -> Tuple[int, np.ndarray]:
        zero_wav = torch.zeros(
            int(self.configs.sampling_rate * fragment_interval), dtype=self.precision, device=self.configs.device
        )

        for i, batch in enumerate(audio):
            for j, audio_fragment in enumerate(batch):
                max_audio = torch.abs(audio_fragment).max()  # Simple guard against 16-bit clipping.
                if max_audio > 1:
                    audio_fragment /= max_audio
                audio_fragment: torch.Tensor = torch.cat([audio_fragment, zero_wav], dim=0)
                audio[i][j] = audio_fragment

        if split_bucket:
            audio = self.recovery_order(audio, batch_index_list)
        else:
            # audio = [item for batch in audio for item in batch]
            audio = sum(audio, [])

        audio = torch.cat(audio, dim=0)

        if super_sampling:
            print(f"############ {i18n('音频超采样')} ############")
            t1 = time.perf_counter()
            self.init_sr_model()
            if not self.sr_model_not_exist:
                audio, sr = self.sr_model(audio.unsqueeze(0), sr)
                max_audio = np.abs(audio).max()
                if max_audio > 1:
                    audio /= max_audio
            t2 = time.perf_counter()
            print(f"超采样用时:{t2 - t1:.3f}s")
        else:
            audio = audio.cpu().numpy()

        audio = (audio * 32768).astype(np.int16)

        # try:
        #     if speed_factor != 1.0:
        #         audio = speed_change(audio, speed=speed_factor, sr=int(sr))
        # except Exception as e:
        #     print(f"Failed to change speed of audio: \n{e}")

        return sr, audio

    def v3_synthesis(
        self, semantic_tokens: torch.Tensor, phones: torch.Tensor, speed: float = 1.0, sample_steps: int = 32
    ):
        prompt_semantic_tokens = self.prompt_cache["prompt_semantic"].unsqueeze(0).unsqueeze(0).to(self.configs.device)
        prompt_phones = torch.LongTensor(self.prompt_cache["phones"]).unsqueeze(0).to(self.configs.device)
        refer_audio_spec = self.prompt_cache["refer_spec"][0].to(dtype=self.precision, device=self.configs.device)

        fea_ref, ge = self.vits_model.decode_encp(prompt_semantic_tokens, prompt_phones, refer_audio_spec)
        ref_audio: torch.Tensor = self.prompt_cache["raw_audio"]
        ref_sr = self.prompt_cache["raw_sr"]
        ref_audio = ref_audio.to(self.configs.device).float()
        if ref_audio.shape[0] == 2:
            ref_audio = ref_audio.mean(0).unsqueeze(0)
        if ref_sr != 24000:
            ref_audio = resample(ref_audio, ref_sr, self.configs.device)

        mel2 = mel_fn(ref_audio)
        mel2 = norm_spec(mel2)
        T_min = min(mel2.shape[2], fea_ref.shape[2])
        mel2 = mel2[:, :, :T_min]
        fea_ref = fea_ref[:, :, :T_min]
        if T_min > 468:
            mel2 = mel2[:, :, -468:]
            fea_ref = fea_ref[:, :, -468:]
            T_min = 468
        chunk_len = 934 - T_min

        mel2 = mel2.to(self.precision)
        fea_todo, ge = self.vits_model.decode_encp(semantic_tokens, phones, refer_audio_spec, ge, speed)

        cfm_resss = []
        idx = 0
        while 1:
            fea_todo_chunk = fea_todo[:, :, idx : idx + chunk_len]
            if fea_todo_chunk.shape[-1] == 0:
                break
            idx += chunk_len
            fea = torch.cat([fea_ref, fea_todo_chunk], 2).transpose(2, 1)

            cfm_res = self.vits_model.cfm.inference(
                fea, torch.LongTensor([fea.size(1)]).to(fea.device), mel2, sample_steps, inference_cfg_rate=0
            )
            cfm_res = cfm_res[:, :, mel2.shape[2] :]

            mel2 = cfm_res[:, :, -T_min:]
            fea_ref = fea_todo_chunk[:, :, -T_min:]

            cfm_resss.append(cfm_res)
        cfm_res = torch.cat(cfm_resss, 2)
        cfm_res = denorm_spec(cfm_res)

        with torch.inference_mode():
            wav_gen = self.bigvgan_model(cfm_res)
            audio = wav_gen[0][0]  # .cpu().detach().numpy()

        return audio

    def v3_synthesis_batched_infer(
        self,
        idx_list: List[int],
        semantic_tokens_list: List[torch.Tensor],
        batch_phones: List[torch.Tensor],
        speed: float = 1.0,
        sample_steps: int = 32,
    ) -> List[torch.Tensor]:
        prompt_semantic_tokens = self.prompt_cache["prompt_semantic"].unsqueeze(0).unsqueeze(0).to(self.configs.device)
        prompt_phones = torch.LongTensor(self.prompt_cache["phones"]).unsqueeze(0).to(self.configs.device)
        refer_audio_spec = self.prompt_cache["refer_spec"][0].to(dtype=self.precision, device=self.configs.device)

        fea_ref, ge = self.vits_model.decode_encp(prompt_semantic_tokens, prompt_phones, refer_audio_spec)
        ref_audio: torch.Tensor = self.prompt_cache["raw_audio"]
        ref_sr = self.prompt_cache["raw_sr"]
        ref_audio = ref_audio.to(self.configs.device).float()
        if ref_audio.shape[0] == 2:
            ref_audio = ref_audio.mean(0).unsqueeze(0)
        if ref_sr != 24000:
            ref_audio = resample(ref_audio, ref_sr, self.configs.device)

        mel2 = mel_fn(ref_audio)
        mel2 = norm_spec(mel2)
        T_min = min(mel2.shape[2], fea_ref.shape[2])
        mel2 = mel2[:, :, :T_min]
        fea_ref = fea_ref[:, :, :T_min]
        if T_min > 468:
            mel2 = mel2[:, :, -468:]
            fea_ref = fea_ref[:, :, -468:]
            T_min = 468
        chunk_len = 934 - T_min

        mel2 = mel2.to(self.precision)

        # #### batched inference
        overlapped_len = 12
        feat_chunks = []
        feat_lens = []
        feat_list = []

        for i, idx in enumerate(idx_list):
            phones = batch_phones[i].unsqueeze(0).to(self.configs.device)
            semantic_tokens = (
                semantic_tokens_list[i][-idx:].unsqueeze(0).unsqueeze(0)
            )  # .unsqueeze(0)  # mq needs one extra unsqueeze
            feat, _ = self.vits_model.decode_encp(semantic_tokens, phones, refer_audio_spec, ge, speed)
            feat_list.append(feat)
            feat_lens.append(feat.shape[2])

        feats = torch.cat(feat_list, 2)
        feats_padded = F.pad(feats, (overlapped_len, 0), "constant", 0)
        pos = 0
        padding_len = 0
        while True:
            if pos == 0:
                chunk = feats_padded[:, :, pos : pos + chunk_len]
            else:
                pos = pos - overlapped_len
                chunk = feats_padded[:, :, pos : pos + chunk_len]
            pos += chunk_len
            if chunk.shape[-1] == 0:
                break

            # padding for the last chunk
            padding_len = chunk_len - chunk.shape[2]
            if padding_len != 0:
                chunk = F.pad(chunk, (0, padding_len), "constant", 0)
            feat_chunks.append(chunk)

        feat_chunks = torch.cat(feat_chunks, 0)
        bs = feat_chunks.shape[0]
        fea_ref = fea_ref.repeat(bs, 1, 1)
        fea = torch.cat([fea_ref, feat_chunks], 2).transpose(2, 1)
        pred_spec = self.vits_model.cfm.inference(
            fea, torch.LongTensor([fea.size(1)]).to(fea.device), mel2, sample_steps, inference_cfg_rate=0
        )
        pred_spec = pred_spec[:, :, -chunk_len:]
        dd = pred_spec.shape[1]
        pred_spec = pred_spec.permute(1, 0, 2).contiguous().view(dd, -1).unsqueeze(0)
        # pred_spec = pred_spec[..., :-padding_len]

        pred_spec = denorm_spec(pred_spec)

        with torch.no_grad():
            wav_gen = self.bigvgan_model(pred_spec)
            audio = wav_gen[0][0]  # .cpu().detach().numpy()

        audio_fragments = []
        upsample_rate = 256
        pos = 0

        while pos < audio.shape[-1]:
            audio_fragment = audio[pos : pos + chunk_len * upsample_rate]
            audio_fragments.append(audio_fragment)
            pos += chunk_len * upsample_rate

        audio = self.sola_algorithm(audio_fragments, overlapped_len * upsample_rate)
        audio = audio[overlapped_len * upsample_rate : -padding_len * upsample_rate]

        audio_fragments = []
        for feat_len in feat_lens:
            audio_fragment = audio[: feat_len * upsample_rate]
            audio_fragments.append(audio_fragment)
            audio = audio[feat_len * upsample_rate :]

        return audio_fragments

    def sola_algorithm(
        self,
        audio_fragments: List[torch.Tensor],
        overlap_len: int,
    ):
        for i in range(len(audio_fragments) - 1):
            f1 = audio_fragments[i]
            f2 = audio_fragments[i + 1]
            w1 = f1[-overlap_len:]
            w2 = f2[:overlap_len]
            assert w1.shape == w2.shape
            corr = F.conv1d(w1.view(1, 1, -1), w2.view(1, 1, -1), padding=w2.shape[-1] // 2).view(-1)[:-1]
            idx = corr.argmax()
            f1_ = f1[: -(overlap_len - idx)]
            audio_fragments[i] = f1_

            f2_ = f2[idx:]
            window = torch.hann_window((overlap_len - idx) * 2, device=f1.device, dtype=f1.dtype)
            f2_[: (overlap_len - idx)] = (
                window[: (overlap_len - idx)] * f2_[: (overlap_len - idx)]
                + window[(overlap_len - idx) :] * f1[-(overlap_len - idx) :]
            )
            audio_fragments[i + 1] = f2_

        return torch.cat(audio_fragments, 0)
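For readers unfamiliar with SOLA: `sola_algorithm` above aligns each pair of neighbouring fragments by cross-correlating their overlap windows, then Hann cross-fades them so the seam is inaudible. A self-contained sketch of the same idea for two 1-D tensors (illustrative only; the class method above is what the pipeline actually uses):

```python
import torch
import torch.nn.functional as F

def sola_join(f1: torch.Tensor, f2: torch.Tensor, overlap_len: int) -> torch.Tensor:
    """Join two fragments whose boundary regions overlap by `overlap_len` samples (SOLA sketch)."""
    w1, w2 = f1[-overlap_len:], f2[:overlap_len]
    # Cross-correlate the two overlap windows to find the best alignment offset.
    corr = F.conv1d(w1.view(1, 1, -1), w2.view(1, 1, -1), padding=overlap_len // 2).view(-1)[:-1]
    idx = int(corr.argmax())
    head, tail = f1[: -(overlap_len - idx)], f2[idx:].clone()
    # Hann cross-fade over the remaining overlap region.
    n = overlap_len - idx
    fade = torch.hann_window(n * 2, device=f1.device, dtype=f1.dtype)
    tail[:n] = fade[:n] * tail[:n] + fade[n:] * f1[-n:]
    return torch.cat([head, tail], dim=0)
```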
GPT_SoVITS/TTS_infer_pack/TextPreprocessor.py
ADDED
@@ -0,0 +1,237 @@
import os
import sys
import threading

from tqdm import tqdm

now_dir = os.getcwd()
sys.path.append(now_dir)

import re
import torch
from text.LangSegmenter import LangSegmenter
from text import chinese
from typing import Dict, List, Tuple
from text.cleaner import clean_text
from text import cleaned_text_to_sequence
from transformers import AutoModelForMaskedLM, AutoTokenizer
from TTS_infer_pack.text_segmentation_method import split_big_text, splits, get_method as get_seg_method

from tools.i18n.i18n import I18nAuto, scan_language_list

language = os.environ.get("language", "Auto")
language = sys.argv[-1] if sys.argv[-1] in scan_language_list() else language
i18n = I18nAuto(language=language)
punctuation = set(["!", "?", "…", ",", ".", "-"])


def get_first(text: str) -> str:
    pattern = "[" + "".join(re.escape(sep) for sep in splits) + "]"
    text = re.split(pattern, text)[0].strip()
    return text


def merge_short_text_in_array(texts: list, threshold: int) -> list:
    if (len(texts)) < 2:
        return texts
    result = []
    text = ""
    for ele in texts:
        text += ele
        if len(text) >= threshold:
            result.append(text)
            text = ""
    if len(text) > 0:
        if len(result) == 0:
            result.append(text)
        else:
            result[len(result) - 1] += text
    return result


class TextPreprocessor:
    def __init__(self, bert_model: AutoModelForMaskedLM, tokenizer: AutoTokenizer, device: torch.device):
        self.bert_model = bert_model
        self.tokenizer = tokenizer
        self.device = device
        self.bert_lock = threading.RLock()

    def preprocess(self, text: str, lang: str, text_split_method: str, version: str = "v2") -> List[Dict]:
        print(f"############ {i18n('切分文本')} ############")
        text = self.replace_consecutive_punctuation(text)
        texts = self.pre_seg_text(text, lang, text_split_method)
        result = []
        print(f"############ {i18n('提取文本Bert特征')} ############")
        for text in tqdm(texts):
            phones, bert_features, norm_text = self.segment_and_extract_feature_for_text(text, lang, version)
            if phones is None or norm_text == "":
                continue
            res = {
                "phones": phones,
                "bert_features": bert_features,
                "norm_text": norm_text,
            }
            result.append(res)
        return result

    def pre_seg_text(self, text: str, lang: str, text_split_method: str):
        text = text.strip("\n")
        if len(text) == 0:
            return []
        if text[0] not in splits and len(get_first(text)) < 4:
            text = "。" + text if lang != "en" else "." + text
        print(i18n("实际输入的目标文本:"))
        print(text)

        seg_method = get_seg_method(text_split_method)
        text = seg_method(text)

        while "\n\n" in text:
            text = text.replace("\n\n", "\n")

        _texts = text.split("\n")
        _texts = self.filter_text(_texts)
        _texts = merge_short_text_in_array(_texts, 5)
        texts = []

        for text in _texts:
            # Skip blank lines in the target text so they do not raise errors.
            if len(text.strip()) == 0:
                continue
            if not re.sub("\W+", "", text):
                # Skip segments that consist of punctuation/symbols only.
                continue
            if text[-1] not in splits:
                text += "。" if lang != "en" else "."

            # Avoid BERT errors caused by overly long sentences.
            if len(text) > 510:
                texts.extend(split_big_text(text))
            else:
                texts.append(text)

        print(i18n("实际输入的目标文本(切句后):"))
        print(texts)
        return texts

    def segment_and_extract_feature_for_text(
        self, text: str, language: str, version: str = "v1"
    ) -> Tuple[list, torch.Tensor, str]:
        return self.get_phones_and_bert(text, language, version)

    def get_phones_and_bert(self, text: str, language: str, version: str, final: bool = False):
        with self.bert_lock:
            if language in {"en", "all_zh", "all_ja", "all_ko", "all_yue"}:
                # language = language.replace("all_", "")
                formattext = text
                while "  " in formattext:
                    formattext = formattext.replace("  ", " ")
                if language == "all_zh":
                    if re.search(r"[A-Za-z]", formattext):
                        formattext = re.sub(r"[a-z]", lambda x: x.group(0).upper(), formattext)
                        formattext = chinese.mix_text_normalize(formattext)
                        return self.get_phones_and_bert(formattext, "zh", version)
                    else:
                        phones, word2ph, norm_text = self.clean_text_inf(formattext, language, version)
                        bert = self.get_bert_feature(norm_text, word2ph).to(self.device)
                elif language == "all_yue" and re.search(r"[A-Za-z]", formattext):
                    formattext = re.sub(r"[a-z]", lambda x: x.group(0).upper(), formattext)
                    formattext = chinese.mix_text_normalize(formattext)
                    return self.get_phones_and_bert(formattext, "yue", version)
                else:
                    phones, word2ph, norm_text = self.clean_text_inf(formattext, language, version)
                    bert = torch.zeros(
                        (1024, len(phones)),
                        dtype=torch.float32,
                    ).to(self.device)
            elif language in {"zh", "ja", "ko", "yue", "auto", "auto_yue"}:
                textlist = []
                langlist = []
                if language == "auto":
                    for tmp in LangSegmenter.getTexts(text):
                        langlist.append(tmp["lang"])
                        textlist.append(tmp["text"])
                elif language == "auto_yue":
                    for tmp in LangSegmenter.getTexts(text):
                        if tmp["lang"] == "zh":
                            tmp["lang"] = "yue"
                        langlist.append(tmp["lang"])
                        textlist.append(tmp["text"])
                else:
                    for tmp in LangSegmenter.getTexts(text):
                        if tmp["lang"] == "en":
                            langlist.append(tmp["lang"])
                        else:
                            # Chinese, Japanese and Korean Han characters cannot be told apart reliably,
                            # so trust the language specified by the user.
                            langlist.append(language)
                        textlist.append(tmp["text"])
                # print(textlist)
                # print(langlist)
                phones_list = []
                bert_list = []
                norm_text_list = []
                for i in range(len(textlist)):
                    lang = langlist[i]
                    phones, word2ph, norm_text = self.clean_text_inf(textlist[i], lang, version)
                    bert = self.get_bert_inf(phones, word2ph, norm_text, lang)
                    phones_list.append(phones)
                    norm_text_list.append(norm_text)
                    bert_list.append(bert)
                bert = torch.cat(bert_list, dim=1)
                phones = sum(phones_list, [])
                norm_text = "".join(norm_text_list)

            if not final and len(phones) < 6:
                return self.get_phones_and_bert("." + text, language, version, final=True)

            return phones, bert, norm_text

    def get_bert_feature(self, text: str, word2ph: list) -> torch.Tensor:
        with torch.no_grad():
            inputs = self.tokenizer(text, return_tensors="pt")
            for i in inputs:
                inputs[i] = inputs[i].to(self.device)
            res = self.bert_model(**inputs, output_hidden_states=True)
            res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu()[1:-1]
        assert len(word2ph) == len(text)
        phone_level_feature = []
        for i in range(len(word2ph)):
            repeat_feature = res[i].repeat(word2ph[i], 1)
            phone_level_feature.append(repeat_feature)
        phone_level_feature = torch.cat(phone_level_feature, dim=0)
        return phone_level_feature.T

    def clean_text_inf(self, text: str, language: str, version: str = "v2"):
        language = language.replace("all_", "")
        phones, word2ph, norm_text = clean_text(text, language, version)
        phones = cleaned_text_to_sequence(phones, version)
        return phones, word2ph, norm_text

    def get_bert_inf(self, phones: list, word2ph: list, norm_text: str, language: str):
        language = language.replace("all_", "")
        if language == "zh":
            feature = self.get_bert_feature(norm_text, word2ph).to(self.device)
        else:
            feature = torch.zeros(
                (1024, len(phones)),
                dtype=torch.float32,
            ).to(self.device)

        return feature

    def filter_text(self, texts):
        _text = []
        if all(text in [None, " ", "\n", ""] for text in texts):
            raise ValueError(i18n("请输入有效文本"))
        for text in texts:
            if text in [None, " ", ""]:
                pass
            else:
                _text.append(text)
        return _text

    def replace_consecutive_punctuation(self, text):
        punctuations = "".join(re.escape(p) for p in punctuation)
        pattern = f"([{punctuations}])([{punctuations}])+"
        result = re.sub(pattern, r"\1", text)
        return result
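A minimal sketch of driving `TextPreprocessor` on its own. The BERT checkpoint path below points at the `chinese-roberta-wwm-ext-large` weights shipped in this upload, but treat the exact local path and the `cut5` choice as illustrative assumptions rather than a fixed API contract.

```python
import torch
from transformers import AutoModelForMaskedLM, AutoTokenizer
from TTS_infer_pack.TextPreprocessor import TextPreprocessor

bert_path = "GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large"  # assumed local path
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = AutoTokenizer.from_pretrained(bert_path)
bert_model = AutoModelForMaskedLM.from_pretrained(bert_path).to(device).eval()

preprocessor = TextPreprocessor(bert_model, tokenizer, device)
# Returns one dict per sentence: {"phones": [...], "bert_features": Tensor, "norm_text": str}
segments = preprocessor.preprocess("今天天气不错,我们去公园散步吧。", lang="all_zh", text_split_method="cut5")
for seg in segments:
    print(seg["norm_text"], len(seg["phones"]), tuple(seg["bert_features"].shape))
```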
GPT_SoVITS/TTS_infer_pack/__init__.py
ADDED
@@ -0,0 +1 @@
from . import TTS, text_segmentation_method
GPT_SoVITS/TTS_infer_pack/text_segmentation_method.py
ADDED
@@ -0,0 +1,189 @@
import re
from typing import Callable

punctuation = set(["!", "?", "…", ",", ".", "-", " "])
METHODS = dict()


def get_method(name: str) -> Callable:
    method = METHODS.get(name, None)
    if method is None:
        raise ValueError(f"Method {name} not found")
    return method


def get_method_names() -> list:
    return list(METHODS.keys())


def register_method(name):
    def decorator(func):
        METHODS[name] = func
        return func

    return decorator


splits = {
    ",",
    "。",
    "?",
    "!",
    ",",
    ".",
    "?",
    "!",
    "~",
    ":",
    ":",
    "—",
    "…",
}


def split_big_text(text, max_len=510):
    # Collect the full-width and half-width punctuation marks.
    punctuation = "".join(splits)

    # Split the text on punctuation, keeping the separators.
    segments = re.split("([" + punctuation + "])", text)

    # Initialize the result list and the current segment.
    result = []
    current_segment = ""

    for segment in segments:
        # If appending the new piece would exceed max_len, flush the current segment and start over.
        if len(current_segment + segment) > max_len:
            result.append(current_segment)
            current_segment = segment
        else:
            current_segment += segment

    # Append the final segment.
    if current_segment:
        result.append(current_segment)

    return result


def split(todo_text):
    todo_text = todo_text.replace("……", "。").replace("——", ",")
    if todo_text[-1] not in splits:
        todo_text += "。"
    i_split_head = i_split_tail = 0
    len_text = len(todo_text)
    todo_texts = []
    while 1:
        if i_split_head >= len_text:
            break  # The text always ends with punctuation, so the last segment was already appended.
        if todo_text[i_split_head] in splits:
            i_split_head += 1
            todo_texts.append(todo_text[i_split_tail:i_split_head])
            i_split_tail = i_split_head
        else:
            i_split_head += 1
    return todo_texts


# No splitting.
@register_method("cut0")
def cut0(inp):
    if not set(inp).issubset(punctuation):
        return inp
    else:
        return "/n"


# Split every four sentences.
@register_method("cut1")
def cut1(inp):
    inp = inp.strip("\n")
    inps = split(inp)
    split_idx = list(range(0, len(inps), 4))
    split_idx[-1] = None
    if len(split_idx) > 1:
        opts = []
        for idx in range(len(split_idx) - 1):
            opts.append("".join(inps[split_idx[idx] : split_idx[idx + 1]]))
    else:
        opts = [inp]
    opts = [item for item in opts if not set(item).issubset(punctuation)]
    return "\n".join(opts)


# Split roughly every 50 characters.
@register_method("cut2")
def cut2(inp):
    inp = inp.strip("\n")
    inps = split(inp)
    if len(inps) < 2:
        return inp
    opts = []
    summ = 0
    tmp_str = ""
    for i in range(len(inps)):
        summ += len(inps[i])
        tmp_str += inps[i]
        if summ > 50:
            summ = 0
            opts.append(tmp_str)
            tmp_str = ""
    if tmp_str != "":
        opts.append(tmp_str)
    # print(opts)
    if len(opts) > 1 and len(opts[-1]) < 50:  # If the last piece is too short, merge it into the previous one.
        opts[-2] = opts[-2] + opts[-1]
        opts = opts[:-1]
    opts = [item for item in opts if not set(item).issubset(punctuation)]
    return "\n".join(opts)


# Split on the Chinese full stop "。".
@register_method("cut3")
def cut3(inp):
    inp = inp.strip("\n")
    opts = ["%s" % item for item in inp.strip("。").split("。")]
    opts = [item for item in opts if not set(item).issubset(punctuation)]
    return "\n".join(opts)


# Split on the English period "." (but not inside numbers).
@register_method("cut4")
def cut4(inp):
    inp = inp.strip("\n")
    opts = re.split(r"(?<!\d)\.(?!\d)", inp.strip("."))
    opts = [item for item in opts if not set(item).issubset(punctuation)]
    return "\n".join(opts)


# Split on punctuation marks.
# contributed by https://github.com/AI-Hobbyist/GPT-SoVITS/blob/main/GPT_SoVITS/inference_webui.py
@register_method("cut5")
def cut5(inp):
    inp = inp.strip("\n")
    punds = {",", ".", ";", "?", "!", "、", ",", "。", "?", "!", ";", ":", "…"}
    mergeitems = []
    items = []

    for i, char in enumerate(inp):
        if char in punds:
            if char == "." and i > 0 and i < len(inp) - 1 and inp[i - 1].isdigit() and inp[i + 1].isdigit():
                items.append(char)
            else:
                items.append(char)
                mergeitems.append("".join(items))
                items = []
        else:
            items.append(char)

    if items:
        mergeitems.append("".join(items))

    opt = [item for item in mergeitems if not set(item).issubset(punds)]
    return "\n".join(opt)


if __name__ == "__main__":
    method = get_method("cut5")
    print(method("你好,我是小明。你好,我是小红。你好,我是小刚。你好,我是小张。"))
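The `register_method` decorator above is the extension point for custom splitters: anything registered this way becomes selectable by name through `get_method`. A small example of that API, where the `cut_newline` splitter is made up purely for illustration:

```python
from TTS_infer_pack.text_segmentation_method import register_method, get_method, punctuation

# Hypothetical extra splitter: treat every input line as one segment.
@register_method("cut_newline")
def cut_newline(inp):
    opts = [line for line in inp.strip("\n").split("\n") if not set(line).issubset(punctuation)]
    return "\n".join(opts)

seg = get_method("cut_newline")   # same lookup used by the TTS pipeline
print(seg("第一行文本。\n第二行文本。"))
```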
GPT_SoVITS/pretrained_models/chinese-hubert-base/pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:24164f129c66499d1346e2aa55f183250c223161ec2770c0da3d3b08cf432d3c
size 188811417
GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large/pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:e53a693acc59ace251d143d068096ae0d7b79e4b1b503fa84c9dcf576448c1d8
size 651225145
GPT_SoVITS/pretrained_models/fast_langdetect/lid.176.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:7e69ec5451bc261cc7844e49e4792a85d7f09c06789ec800fc4a44aec362764e
size 131266198
GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:732f94e63b148066e24c7f9d2637f3374083e637635f07fbdb695dee20ddbe1f
size 155315150
GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s2D2333k.pth
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:8ae7fe8dd8c8f2e718de359e00edac88b0c71ab2fd10b07ad4cc45070eb8a836
size 93534164
GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s2G2333k.pth
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:924fdccaa3c574bf139c25c9759aa1ed3b3f99e19a7c529ee996c2bc17663695
size 106035259
GPT_SoVITS/pretrained_models/gsv-v4-pretrained/s2Gv4.pth
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:906fe22f48c3e037a389df291d4d32a9414e15dbb8f9628643e83aaced109ea4
size 769025545
GPT_SoVITS/pretrained_models/gsv-v4-pretrained/vocoder.pth
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:4d611913df7b12d49e8976c944558d2d096816365edfc6c35a9e85b67dd14ed9
size 57781109
GPT_SoVITS/pretrained_models/models--nvidia--bigvgan_v2_24khz_100band_256x/bigvgan_generator.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:ee5e2f9cd60b51db75e1806f4fe7621733757586c541c78cb3dd369d5ba24476
size 225179685
GPT_SoVITS/pretrained_models/s1v3.ckpt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:87133414860ea14ff6620c483a3db5ed07b44be42e2c3fcdad65523a729a745a
size 155284856
GPT_SoVITS/pretrained_models/s2G488k.pth
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:020a014e1e01e550e510f2f61fae5e5f5b6aab40f15c22f1f12f724df507e835
size 105973721
GPT_SoVITS/pretrained_models/s2Gv3.pth
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:f33abb1920076d988e1711d5f41b5c9c6d7f92575b4acf0ad4fae6a4ebf0cf19
size 769035145
GPT_SoVITS/text/G2PWModel/g2pW.onnx
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:2eb3c71fd95117b2e1abef8d2d0cd78aae894bbe7f0fac105ddc9c32ce63cbd0
size 635212732
GPT_SoVITS/text/engdict_cache.pickle
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:9bff9393f4b192d873a11335efc8f124771087b6dc847d34fd240c2846889d2b
size 5965909
GPT_SoVITS/text/g2pw/polyphonic.pickle
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:f425246160a32c578557cd3151cd0bb97f5f44c3aaf65e718dd2c3213c04fb4b
size 1322387
GPT_SoVITS/text/ja_userdic/userdict.csv
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:d857e443ee48d9641096816a98996669602895411e4330d7d91d1dbe1103389f
size 17180971
GPT_SoVITS/text/namedict_cache.pickle
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:559552094c4a6e995213e3fa586330e078ef8cb3a7a95a3109e945111cd2bfc1
size 760663
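The weight files above are stored as Git LFS pointer files; the actual binaries are fetched by `git lfs pull` or the Hub download tooling. If a downloaded file is suspect, its SHA-256 digest can be compared against the `oid sha256:...` line of the matching pointer. A small sketch, with an illustrative file path:

```python
import hashlib

def sha256_of(path: str, chunk_size: int = 1 << 20) -> str:
    # Stream the file so large checkpoints do not need to fit in memory.
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            h.update(chunk)
    return h.hexdigest()

# Should match the oid recorded in the pointer file for this weight.
print(sha256_of("GPT_SoVITS/pretrained_models/s1v3.ckpt"))
```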
docs/cn/Changelog_CN.md
ADDED
@@ -0,0 +1,302 @@
1 |
+
### 20240121更新
|
2 |
+
|
3 |
+
1-config添加is_share, 诸如colab等场景可以将此改为True, 来使得webui映射到公网
|
4 |
+
|
5 |
+
2-WebUI添加英文系统英文翻译适配
|
6 |
+
|
7 |
+
3-cmd-asr自动判断是否已自带damo模型, 如不在默认目录上将从modelscope自带下载
|
8 |
+
|
9 |
+
4-[SoVITS训练报错ZeroDivisionError](https://github.com/RVC-Boss/GPT-SoVITS/issues/79) 尝试修复(过滤长度0的样本等)
|
10 |
+
|
11 |
+
5-清理TEMP文件夹缓存音频等文件
|
12 |
+
|
13 |
+
6-大幅削弱合成音频包含参考音频结尾的问题
|
14 |
+
|
15 |
+
### 20240122更新
|
16 |
+
|
17 |
+
1-修复过短输出文件返回重复参考音频的问题.
|
18 |
+
|
19 |
+
2-经测试, 英文日文训练原生支持(日文训练需要根目录不含非英文等特殊字符).
|
20 |
+
|
21 |
+
3-音频路径检查.如果尝试读取输入错的路径报错路径不存在, 而非ffmpeg错误.
|
22 |
+
|
23 |
+
### 20240123更新
|
24 |
+
|
25 |
+
1-解决hubert提取nan导致SoVITS/GPT训练报错ZeroDivisionError的问题
|
26 |
+
|
27 |
+
2-支持推理界面快速切换模型
|
28 |
+
|
29 |
+
3-优化模型文件排序逻辑
|
30 |
+
|
31 |
+
4-中文分词使用jieba_fast代替jieba
|
32 |
+
|
33 |
+
### 20240126更新
|
34 |
+
|
35 |
+
1-支持输出文本中英混合、日英混合
|
36 |
+
|
37 |
+
2-输出可选切分模式
|
38 |
+
|
39 |
+
3-修复uvr5读取到目录自动跳出的问题
|
40 |
+
|
41 |
+
4-修复多个换行导致推理报错
|
42 |
+
|
43 |
+
5-去除推理界面大量冗余log
|
44 |
+
|
45 |
+
6-支持mac训练推理
|
46 |
+
|
47 |
+
7-自动识别不支持半精度的卡强制单精度.cpu推理下强制单精度.
|
48 |
+
|
49 |
+
### 20240128更新
|
50 |
+
|
51 |
+
1-修复数字转汉字念法问题
|
52 |
+
|
53 |
+
2-修复句首少量字容易吞字的问题
|
54 |
+
|
55 |
+
3-通过限制排除不合理的参考音频长度
|
56 |
+
|
57 |
+
4-修复GPT训练不保存ckpt的问题
|
58 |
+
|
59 |
+
5-完善Dockerfile的下载模型流程
|
60 |
+
|
61 |
+
### 20240129更新
|
62 |
+
|
63 |
+
1-16系等半精度训练有问题的显卡把训练配置改为单精度训练
|
64 |
+
|
65 |
+
2-测试更新可用的colab版本
|
66 |
+
|
67 |
+
3-修复git clone modelscope funasr仓库+老版本funasr导致接口不对齐报错的问题
|
68 |
+
|
69 |
+
|
70 |
+
### 20240130更新
|
71 |
+
|
72 |
+
1-所有涉及路径的地方双引号自动去除,小白复制路径带双引号不会报错
|
73 |
+
|
74 |
+
2-修复中英文标点切割问题和句首句尾补标点的问题
|
75 |
+
|
76 |
+
3-增加按标点符号切分
|
77 |
+
|
78 |
+
### 20240201更新
|
79 |
+
|
80 |
+
1-修复uvr5读取格式错误导致分离失败的问题
|
81 |
+
|
82 |
+
2-支持中日英混合多种文本自动切分识别语种
|
83 |
+
|
84 |
+
### 20240202更新
|
85 |
+
|
86 |
+
1-修复asr路径尾缀带/保存文件名报错
|
87 |
+
|
88 |
+
2-引入paddlespeech的Normalizer https://github.com/RVC-Boss/GPT-SoVITS/pull/377 修复一些问题, 例如: xx.xx%(带百分号类), 元/吨 会读成 元吨 而不是元每吨,下划线不再会报错
|
89 |
+
|
90 |
+
### 20240207更新
|
91 |
+
|
92 |
+
1-修正语种传参混乱导致中文推理效果下降 https://github.com/RVC-Boss/GPT-SoVITS/issues/391
|
93 |
+
|
94 |
+
2-uvr5适配高版本librosa https://github.com/RVC-Boss/GPT-SoVITS/pull/403
|
95 |
+
|
96 |
+
3-[修复uvr5 inf everywhere报错的问题(is_half传参未转换bool导致恒定半精度推理, 16系显卡会inf)](https://github.com/RVC-Boss/GPT-SoVITS/commit/14a285109a521679f8846589c22da8f656a46ad8)
|
97 |
+
|
98 |
+
4-优化英文文本前端
|
99 |
+
|
100 |
+
5-修复gradio依赖
|
101 |
+
|
102 |
+
6-支持三连根目录留空自动读取.list全路径
|
103 |
+
|
104 |
+
7-集成faster whisper ASR日文英文
|
105 |
+
|
106 |
+
### 20240208更新
|
107 |
+
|
108 |
+
1-GPT训练卡死 (win10 1909) 和https://github.com/RVC-Boss/GPT-SoVITS/issues/232 (系统语言繁体) GPT训练报错, [尝试修复](https://github.com/RVC-Boss/GPT-SoVITS/commit/59f35adad85815df27e9c6b33d420f5ebfd8376b).
|
109 |
+
|
110 |
+
### 20240212更新
|
111 |
+
|
112 |
+
1-faster whisper和funasr逻辑优化.faster whisper转镜像站下载, 规避huggingface连不上的问题.
|
113 |
+
|
114 |
+
2-DPO Loss实验性训练选项开启, 通过构造负样本训练缓解GPT重复漏字问题.推理界面公开几个推理参数. https://github.com/RVC-Boss/GPT-SoVITS/pull/457
|
115 |
+
|
116 |
+
### 20240214更新
|
117 |
+
|
118 |
+
1-训练支持中文实验名 (原来会报错)
|
119 |
+
|
120 |
+
2-DPO训练改为可勾选选项而非必须.如勾选batch size自动减半.修复推理界面新参数不传参的问题.
|
121 |
+
|
122 |
+
### 20240216更新
|
123 |
+
|
124 |
+
1-支持无参考文本输入
|
125 |
+
|
126 |
+
2-修复中文文本前端bug https://github.com/RVC-Boss/GPT-SoVITS/issues/475
|
127 |
+
|
128 |
+
### 20240221更新
|
129 |
+
|
130 |
+
1-数据处理添加语音降噪选项 (降噪为只剩16k采样率, 除非底噪很大先不急着用哦).
|
131 |
+
|
132 |
+
2-中文日文前端处理优化 https://github.com/RVC-Boss/GPT-SoVITS/pull/559 https://github.com/RVC-Boss/GPT-SoVITS/pull/556 https://github.com/RVC-Boss/GPT-SoVITS/pull/532 https://github.com/RVC-Boss/GPT-SoVITS/pull/507 https://github.com/RVC-Boss/GPT-SoVITS/pull/509
|
133 |
+
|
134 |
+
3-mac CPU推理更快因此把推理设备从mps改到CPU
|
135 |
+
|
136 |
+
4-colab修复不开启公网url
|
137 |
+
|
138 |
+
### 20240306更新
|
139 |
+
|
140 |
+
1-推理加速50% (RTX3090+pytorch2.2.1+cu11.8+win10+py39 tested) https://github.com/RVC-Boss/GPT-SoVITS/pull/672
|
141 |
+
|
142 |
+
2-如果用faster whisper非中文ASR不再需要先下中文funasr模型
|
143 |
+
|
144 |
+
3-修复uvr5去混响模型 是否混响 反的 https://github.com/RVC-Boss/GPT-SoVITS/pull/610
|
145 |
+
|
146 |
+
4-faster whisper如果无cuda可用自动cpu推理 https://github.com/RVC-Boss/GPT-SoVITS/pull/675
|
147 |
+
|
148 |
+
5-修改is_half的判断使在Mac上能正常CPU推理 https://github.com/RVC-Boss/GPT-SoVITS/pull/573
|
149 |
+
|
150 |
+
### 202403/202404/202405更新
|
151 |
+
|
152 |
+
2个重点
|
153 |
+
|
154 |
+
1-修复sovits训练未冻结vq的问题 (可能造成效果下降)
|
155 |
+
|
156 |
+
2-增加一个快速推理分支
|
157 |
+
|
158 |
+
以下都是小修补
|
159 |
+
|
160 |
+
1-修复无参考文本模式问题
|
161 |
+
|
162 |
+
2-优化中英文文本前端
|
163 |
+
|
164 |
+
3-api格式优化
|
165 |
+
|
166 |
+
4-cmd格式问题修复
|
167 |
+
|
168 |
+
5-训练数据处理阶段不支持的语言提示报错
|
169 |
+
|
170 |
+
6-nan自动转fp32阶段的hubert提取bug修复
|
171 |
+
|
172 |
+
### 20240610
|
173 |
+
|
174 |
+
小问题修复:
|
175 |
+
|
176 |
+
1-完善纯标点、多标点文本输入的判断逻辑 https://github.com/RVC-Boss/GPT-SoVITS/pull/1168 https://github.com/RVC-Boss/GPT-SoVITS/pull/1169
|
177 |
+
|
178 |
+
2-uvr5中的mdxnet去混响cmd格式修复, 兼容路径带空格 [#501a74a](https://github.com/RVC-Boss/GPT-SoVITS/commit/501a74ae96789a26b48932babed5eb4e9483a232)
|
179 |
+
|
180 |
+
3-s2训练进度条逻辑修复 https://github.com/RVC-Boss/GPT-SoVITS/pull/1159
|
181 |
+
|
182 |
+
大问题修复:
|
183 |
+
|
184 |
+
4-修复了webui的GPT中文微调没读到bert导致和推理不一致, 训练太多可能效果还会变差的问题.如果大量数据微调的建议重新微调模型得到质量优化 [#99f09c8](https://github.com/RVC-Boss/GPT-SoVITS/commit/99f09c8bdc155c1f4272b511940717705509582a)
|
185 |
+
|
186 |
+
### 20240706
|
187 |
+
|
188 |
+
小问题修复:
|
189 |
+
|
190 |
+
1-[修正CPU推理默认bs小数](https://github.com/RVC-Boss/GPT-SoVITS/commit/db50670598f0236613eefa6f2d5a23a271d82041)
|
191 |
+
|
192 |
+
2-修复降噪、asr中途遇到异常跳出所有需处理的音频文件的问题 https://github.com/RVC-Boss/GPT-SoVITS/pull/1258 https://github.com/RVC-Boss/GPT-SoVITS/pull/1265 https://github.com/RVC-Boss/GPT-SoVITS/pull/1267
|
193 |
+
|
194 |
+
3-修复按标点符号切分时小数会被切分 https://github.com/RVC-Boss/GPT-SoVITS/pull/1253
|
195 |
+
|
196 |
+
4-[多卡训练多进程保存逻辑修复](https://github.com/RVC-Boss/GPT-SoVITS/commit/a208698e775155efc95b187b746d153d0f2847ca)
|
197 |
+
|
198 |
+
5-移除冗余my_utils https://github.com/RVC-Boss/GPT-SoVITS/pull/1251
|
199 |
+
|
200 |
+
重点:
|
201 |
+
|
202 |
+
6-倍速推理代码经过验证后推理效果和base完全一致, 合并进main.使用的代码: https://github.com/RVC-Boss/GPT-SoVITS/pull/672 .支持无参考文本模式也倍速.
|
203 |
+
|
204 |
+
后面会逐渐验证快速推理分支的推理改动的一致性
|
205 |
+
|
206 |
+
### 20240727
|
207 |
+
|
208 |
+
1-清理冗余i18n代码 https://github.com/RVC-Boss/GPT-SoVITS/pull/1298
|
209 |
+
|
210 |
+
2-修复用户打文件及路径在结尾添加/会导致命令行报错的问题 https://github.com/RVC-Boss/GPT-SoVITS/pull/1299
|
211 |
+
|
212 |
+
3-修复GPT训练的step计算逻辑 https://github.com/RVC-Boss/GPT-SoVITS/pull/756
|
213 |
+
|
214 |
+
重点:
|
215 |
+
|
216 |
+
4-[支持合成语速调节.支持冻结随机性只调节语速, ](https://github.com/RVC-Boss/GPT-SoVITS/commit/9588a3c52d9ebdb20b3c5d74f647d12e7c1171c2)并将其更新到api.py上https://github.com/RVC-Boss/GPT-SoVITS/pull/1340
|
217 |
+
|
218 |
+
|
219 |
+
### 20240806
|
220 |
+
|
221 |
+
1-增加bs-roformer人声伴奏分离模型支持. https://github.com/RVC-Boss/GPT-SoVITS/pull/1306 https://github.com/RVC-Boss/GPT-SoVITS/pull/1356 [支持fp16推理.](https://github.com/RVC-Boss/GPT-SoVITS/commit/e62e965323a60a76a025bcaa45268c1ddcbcf05c)
|
222 |
+
|
223 |
+
2-更好的中文文本前端. https://github.com/RVC-Boss/GPT-SoVITS/pull/987 https://github.com/RVC-Boss/GPT-SoVITS/pull/1351 https://github.com/RVC-Boss/GPT-SoVITS/pull/1404 优化多音字逻辑 (v2版本特供). https://github.com/RVC-Boss/GPT-SoVITS/pull/488
|
224 |
+
|
225 |
+
3-自动填充下一步的文件路径 https://github.com/RVC-Boss/GPT-SoVITS/pull/1355
|
226 |
+
|
227 |
+
4-增加喂饭逻辑, 用户瞎写显卡序号也可以正常运作 [bce451a](https://github.com/RVC-Boss/GPT-SoVITS/commit/bce451a2d1641e581e200297d01f219aeaaf7299) [4c8b761](https://github.com/RVC-Boss/GPT-SoVITS/commit/4c8b7612206536b8b4435997acb69b25d93acb78)
|
228 |
+
|
229 |
+
5-增加粤语ASR支持 [8a10147](https://github.com/RVC-Boss/GPT-SoVITS/commit/8a101474b5a4f913b4c94fca2e3ca87d0771bae3)
|
230 |
+
|
231 |
+
6-GPT-SoVITS-v2支持
|
232 |
+
|
233 |
+
7-计时逻辑优化 https://github.com/RVC-Boss/GPT-SoVITS/pull/1387
|
234 |
+
|
235 |
+
### 20240821
|
236 |
+
|
237 |
+
1-fast_inference分支合并进main: https://github.com/RVC-Boss/GPT-SoVITS/pull/1490
|
238 |
+
|
239 |
+
2-支持通过ssml标签优化数字、电话、时间日期等: https://github.com/RVC-Boss/GPT-SoVITS/issues/1508
|
240 |
+
|
241 |
+
3-api修复优化: https://github.com/RVC-Boss/GPT-SoVITS/pull/1503
|
242 |
+
|
243 |
+
4-修复了参考音频混合只能上传一条的bug:https://github.com/RVC-Boss/GPT-SoVITS/pull/1422
|
244 |
+
|
245 |
+
5-增加了各种数据集检查,若缺失会弹出warning:https://github.com/RVC-Boss/GPT-SoVITS/pull/1422
|
246 |
+
|
247 |
+
### 20250211
|
248 |
+
|
249 |
+
增加gpt-sovits-v3模型, 需要14G显存可以微调
|
250 |
+
|
251 |
+
### 20250212
|
252 |
+
|
253 |
+
sovits-v3微调支持开启梯度检查点, 需要12G显存可以微调https://github.com/RVC-Boss/GPT-SoVITS/pull/2040
|
254 |
+
|
255 |
+
### 20250214
|
256 |
+
|
257 |
+
优化多语种混合文本切分策略a https://github.com/RVC-Boss/GPT-SoVITS/pull/2047
|
258 |
+
|
259 |
+
### 20250217
|
260 |
+
|
261 |
+
优化文本里的数字和英文处理逻辑https://github.com/RVC-Boss/GPT-SoVITS/pull/2062
|
262 |
+
|
263 |
+
### 20250218
|
264 |
+
|
265 |
+
优化多语种混合文本切分策略b https://github.com/RVC-Boss/GPT-SoVITS/pull/2073
|
266 |
+
|
267 |
+
### 20250223
|
268 |
+
|
269 |
+
1-sovits-v3微调支持lora训练, 需要8G显存可以微调, 效果比全参微调更好
|
270 |
+
|
271 |
+
2-人声背景音分离增加mel band roformer模型支持https://github.com/RVC-Boss/GPT-SoVITS/pull/2078
|
272 |
+
|
273 |
+
### 20250226
|
274 |
+
|
275 |
+
https://github.com/RVC-Boss/GPT-SoVITS/pull/2112 https://github.com/RVC-Boss/GPT-SoVITS/pull/2114
|
276 |
+
|
277 |
+
修复中文路径下mecab的报错 (具体表现为日文韩文、文本混合语种切分可能会遇到的报错)
|
278 |
+
|
279 |
+
### 20250227
|
280 |
+
|
281 |
+
针对v3生成24k音频感觉闷的问题https://github.com/RVC-Boss/GPT-SoVITS/issues/2085 https://github.com/RVC-Boss/GPT-SoVITS/issues/2117 ,支持使用24k to 48k的音频超分模型缓解.
|
282 |
+
|
283 |
+
|
284 |
+
### 20250228
|
285 |
+
|
286 |
+
修复短文本语种选择出错 https://github.com/RVC-Boss/GPT-SoVITS/pull/2122
|
287 |
+
|
288 |
+
修复v3sovits未传参以支持调节语速
|
289 |
+
|
290 |
+
### 202503
|
291 |
+
|
292 |
+
修复一批由依赖的库版本不对导致的问题https://github.com/RVC-Boss/GPT-SoVITS/commit/6c468583c5566e5fbb4fb805e4cc89c403e997b8
|
293 |
+
|
294 |
+
修复模型加载异步逻辑https://github.com/RVC-Boss/GPT-SoVITS/commit/03b662a769946b7a6a8569a354860e8eeeb743aa
|
295 |
+
|
296 |
+
修复其他若干bug
|
297 |
+
|
298 |
+
重点更新:
|
299 |
+
|
300 |
+
1-v3支持并行推理 https://github.com/RVC-Boss/GPT-SoVITS/commit/03b662a769946b7a6a8569a354860e8eeeb743aa
|
301 |
+
|
302 |
+
2-整合包修复onnxruntime GPU推理的支持, 影响: (1) g2pw有个onnx模型原先是CPU推理现在用GPU, 显著降低推理的CPU瓶颈 (2) foxjoy去混响模型现在可使用GPU推理
|
docs/cn/README.md
ADDED
@@ -0,0 +1,395 @@
1 |
+
<div align="center">
|
2 |
+
|
3 |
+
<h1>GPT-SoVITS-WebUI</h1>
|
4 |
+
强大的少样本语音转换与语音合成Web用户界面.<br><br>
|
5 |
+
|
6 |
+
[](https://github.com/RVC-Boss/GPT-SoVITS)
|
7 |
+
|
8 |
+
<a href="https://trendshift.io/repositories/7033" target="_blank"><img src="https://trendshift.io/api/badge/repositories/7033" alt="RVC-Boss%2FGPT-SoVITS | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
|
9 |
+
|
10 |
+
<!-- img src="https://counter.seku.su/cmoe?name=gptsovits&theme=r34" /><br> -->
|
11 |
+
|
12 |
+
[](https://colab.research.google.com/github/RVC-Boss/GPT-SoVITS/blob/main/colab_webui.ipynb)
|
13 |
+
[](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/LICENSE)
|
14 |
+
[](https://huggingface.co/spaces/lj1995/GPT-SoVITS-v2)
|
15 |
+
[](https://discord.gg/dnrgs5GHfG)
|
16 |
+
|
17 |
+
[**English**](../../README.md) | **中文简体** | [**日本語**](../ja/README.md) | [**한국어**](../ko/README.md) | [**Türkçe**](../tr/README.md)
|
18 |
+
|
19 |
+
</div>
|
20 |
+
|
21 |
+
---
|
22 |
+
|
23 |
+
## 功能:
|
24 |
+
|
25 |
+
1. **零样本文本到语音 (TTS): ** 输入 5 秒的声音样本, 即刻体验文本到语音转换.
|
26 |
+
|
27 |
+
2. **少样本 TTS: ** 仅需 1 分钟的训练数据即可微调模型, 提升声音相似度和真实感.
|
28 |
+
|
29 |
+
3. **跨语言支持: ** 支持与训练数据集不同语言的推理, 目前支持英语、日语、韩语、粤语和中文.
|
30 |
+
|
31 |
+
4. **WebUI 工具: ** 集成工具包括声音伴奏分离、自动训练集分割、中文自动语音识别(ASR)和文本标注, 协助初学者创建训练数据集和 GPT/SoVITS 模型.
|
32 |
+
|
33 |
+
**查看我们的介绍视频 [demo video](https://www.bilibili.com/video/BV12g4y1m7Uw)**
|
34 |
+
|
35 |
+
未见过的说话者 few-shot 微调演示:
|
36 |
+
|
37 |
+
https://github.com/RVC-Boss/GPT-SoVITS/assets/129054828/05bee1fa-bdd8-4d85-9350-80c060ab47fb
|
38 |
+
|
39 |
+
**用户手册: [简体中文](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e) | [English](https://rentry.co/GPT-SoVITS-guide#/)**
|
40 |
+
|
41 |
+
## 安装
|
42 |
+
|
43 |
+
中国地区的用户可[点击此处](https://www.codewithgpu.com/i/RVC-Boss/GPT-SoVITS/GPT-SoVITS-Official)使用 AutoDL 云端镜像进行体验.
|
44 |
+
|
45 |
+
### 测试通过的环境
|
46 |
+
|
47 |
+
| Python Version | PyTorch Version | Device |
|
48 |
+
|----------------|------------------|-----------------|
|
49 |
+
| Python 3.9 | PyTorch 2.0.1 | CUDA 11.8 |
|
50 |
+
| Python 3.10.13 | PyTorch 2.1.2 | CUDA 12.3 |
|
51 |
+
| Python 3.10.17 | PyTorch 2.5.1 | CUDA 12.4 |
|
52 |
+
| Python 3.9 | PyTorch 2.5.1 | Apple silicon |
|
53 |
+
| Python 3.11 | PyTorch 2.6.0 | Apple silicon |
|
54 |
+
| Python 3.9 | PyTorch 2.2.2 | CPU |
|
55 |
+
|
56 |
+
### Windows
|
57 |
+
|
58 |
+
如果你是 Windows 用户 (已在 win>=10 上测试), 可以下载[整合包](https://huggingface.co/lj1995/GPT-SoVITS-windows-package/resolve/main/GPT-SoVITS-v3lora-20250228.7z?download=true), 解压后双击 go-webui.bat 即可启动 GPT-SoVITS-WebUI.
|
59 |
+
|
60 |
+
**中国地区的用户可以[在此处下载整合包](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e/dkxgpiy9zb96hob4#KTvnO).**
|
61 |
+
|
62 |
+
### Linux
|
63 |
+
|
64 |
+
```bash
|
65 |
+
conda create -n GPTSoVits python=3.9
|
66 |
+
conda activate GPTSoVits
|
67 |
+
bash install.sh --source <HF|HF-Mirror|ModelScope> [--download-uvr5]
|
68 |
+
```
|
69 |
+
|
70 |
+
### macOS
|
71 |
+
|
72 |
+
**注: 在 Mac 上使用 GPU 训练的模型效果显著低于其他设备训练的模型, 所以我们暂时使用 CPU 进行训练.**
|
73 |
+
|
74 |
+
1. 运行 `xcode-select --install` 安装 Xcode command-line tools.
|
75 |
+
2. 运行以下的命令来安装本项目:
|
76 |
+
|
77 |
+
```bash
|
78 |
+
conda create -n GPTSoVits python=3.9
|
79 |
+
conda activate GPTSoVits
|
80 |
+
bash install.sh --source <HF|HF-Mirror|ModelScope> [--download-uvr5]
|
81 |
+
```
|
82 |
+
|
83 |
+
### 手动安装
|
84 |
+
|
85 |
+
#### 安装 FFmpeg
|
86 |
+
|
87 |
+
##### Conda 用户
|
88 |
+
|
89 |
+
```bash
|
90 |
+
conda install ffmpeg
|
91 |
+
```
|
92 |
+
|
93 |
+
##### Ubuntu/Debian 用户
|
94 |
+
|
95 |
+
```bash
|
96 |
+
sudo apt install ffmpeg
|
97 |
+
sudo apt install libsox-dev
|
98 |
+
conda install -c conda-forge 'ffmpeg<7'
|
99 |
+
```
|
100 |
+
|
101 |
+
##### Windows 用户
|
102 |
+
|
103 |
+
下载并将 [ffmpeg.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffmpeg.exe) 和 [ffprobe.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffprobe.exe) 放置在 GPT-SoVITS 根目录下.
|
104 |
+
|
105 |
+
安装 [Visual Studio 2017](https://aka.ms/vs/17/release/vc_redist.x86.exe) 环境(仅限韩语 TTS)
|
106 |
+
|
107 |
+
##### MacOS 用户
|
108 |
+
|
109 |
+
```bash
|
110 |
+
brew install ffmpeg
|
111 |
+
```
|
112 |
+
|
113 |
+
#### 安装依赖
|
114 |
+
|
115 |
+
```bash
|
116 |
+
pip install -r extra-req.txt --no-deps
|
117 |
+
pip install -r requirements.txt
|
118 |
+
```
|
119 |
+
|
120 |
+
### 在 Docker 中使用
|
121 |
+
|
122 |
+
#### docker-compose.yaml 设置
|
123 |
+
|
124 |
+
0. image 的标签: 由于代码库更新很快, 镜像的打包和测试又很慢, 所以请自行在 [Docker Hub](https://hub.docker.com/r/breakstring/gpt-sovits)(旧版本) 查看当前打包好的最新的镜像并根据自己的情况选用, 或者在本地根据您自己的需求通过 Dockerfile 进行构建.
|
125 |
+
1. 环境变量:
|
126 |
+
|
127 |
+
- is_half: 半精度/双精度控制.在进行 "SSL extracting" 步骤时如果无法正确生成 4-cnhubert/5-wav32k 目录下的内容时, 一般都是它引起的, 可以根据实际情况来调整为 True 或者 False.
|
128 |
+
|
129 |
+
2. Volume 设置, 容器内的应用根目录设置为 /workspace. 默认的 docker-compose.yaml 中列出了一些实际的例子, 便于上传/下载内容.
|
130 |
+
3. shm_size: Windows 下的 Docker Desktop 默认可用内存过小, 会导致运行异常, 根据自己情况酌情设置.
|
131 |
+
4. deploy 小节下的 gpu 相关内容, 请根据您的系统和实际情况酌情设置.
|
132 |
+
|
133 |
+
#### 通过 docker compose 运行
|
134 |
+
|
135 |
+
```
|
136 |
+
docker compose -f "docker-compose.yaml" up -d
|
137 |
+
```
|
138 |
+
|
139 |
+
#### 通过 docker 命令运行
|
140 |
+
|
141 |
+
同上, 根据您自己的实际情况修改对应的参数, 然后运行如下命令:
|
142 |
+
|
143 |
+
```
|
144 |
+
docker run --rm -it --gpus=all --env=is_half=False --volume=G:\GPT-SoVITS-DockerTest\output:/workspace/output --volume=G:\GPT-SoVITS-DockerTest\logs:/workspace/logs --volume=G:\GPT-SoVITS-DockerTest\SoVITS_weights:/workspace/SoVITS_weights --workdir=/workspace -p 9880:9880 -p 9871:9871 -p 9872:9872 -p 9873:9873 -p 9874:9874 --shm-size="16G" -d breakstring/gpt-sovits:xxxxx
|
145 |
+
```
|
146 |
+
|
147 |
+
## 预训练模型
|
148 |
+
|
149 |
+
**若成功运行`install.sh`可跳过 No.1,2,3**
|
150 |
+
|
151 |
+
**中国地区的用户可以[在此处下载这些模型](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e/dkxgpiy9zb96hob4#nVNhX).**
|
152 |
+
|
153 |
+
1. 从 [GPT-SoVITS Models](https://huggingface.co/lj1995/GPT-SoVITS) 下载预训练模型, 并将其放置在 `GPT_SoVITS/pretrained_models` 目录中.
|
154 |
+
|
155 |
+
2. 从 [G2PWModel.zip(HF)](https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/G2PWModel.zip)| [G2PWModel.zip(ModelScope)](https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/G2PWModel.zip) 下载模型, 解压并重命名为 `G2PWModel`, 然后将其放置在 `GPT_SoVITS/text` 目录中. (仅限中文 TTS)
|
156 |
+
|
157 |
+
3. 对于 UVR5 (人声/伴奏分离和混响移除, 额外功能), 从 [UVR5 Weights](https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main/uvr5_weights) 下载模型, 并将其放置在 `tools/uvr5/uvr5_weights` 目录中.
|
158 |
+
|
159 |
+
- 如果你在 UVR5 中使用 `bs_roformer` 或 `mel_band_roformer`模型, 你可以手动下载模型和相应的配置文件, 并将它们放在 `tools/UVR5/UVR5_weights` 中.**重命名模型文件和配置文件, 确保除后缀外**, 模型和配置文件具有相同且对应的名称.此外, 模型和配置文件名**必须包含"roformer"**, 才能被识别为 roformer 类的模型.
|
160 |
+
|
161 |
+
- 建议在模型名称和配置文件名中**直接指定模型类型**, 例如`mel_mand_roformer`、`bs_roformer`.如果未指定, 将从配置文中比对特征, 以确定它是哪种类型的模型.例如, 模型`bs_roformer_ep_368_sdr_12.9628.ckpt` 和对应的配置文件`bs_roformer_ep_368_sdr_12.9628.yaml` 是一对.`kim_mel_band_roformer.ckpt` 和 `kim_mel_band_roformer.yaml` 也是一对.
|
162 |
+
|
163 |
+
4. 对于中文 ASR (额外功能), 从 [Damo ASR Model](https://modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/files)、[Damo VAD Model](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/files) 和 [Damo Punc Model](https://modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/files) 下载模型, 并将它们放置在 `tools/asr/models` 目录中.
|
164 |
+
|
165 |
+
5. 对于英语或日语 ASR (额外功能), 从 [Faster Whisper Large V3](https://huggingface.co/Systran/faster-whisper-large-v3) 下载模型, 并将其放置在 `tools/asr/models` 目录中.此外, [其他模型](https://huggingface.co/Systran) 可能具有类似效果且占用更少的磁盘空间.
|
166 |
+
|
167 |
+
## 数据集格式
|
168 |
+
|
169 |
+
文本到语音 (TTS) 注释 .list 文件格式:
|
170 |
+
|
171 |
+
```
|
172 |
+
vocal_path|speaker_name|language|text
|
173 |
+
```
|
174 |
+
|
175 |
+
语言字典:
|
176 |
+
|
177 |
+
- 'zh': 中文
|
178 |
+
- 'ja': 日语
|
179 |
+
- 'en': 英语
|
180 |
+
- 'ko': 韩语
|
181 |
+
- 'yue': 粤语
|
182 |
+
|
183 |
+
示例:
|
184 |
+
|
185 |
+
```
|
186 |
+
D:\GPT-SoVITS\xxx/xxx.wav|xxx|zh|我爱玩原神.
|
187 |
+
```
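To make the annotation format above concrete, here is a small hedged Python sketch (illustrative only, not the project's own loader) that parses one `.list` file and validates the language code against the dictionary listed above; the file path in the example is a placeholder.

```python
# Illustrative parser for the TTS annotation format described above:
# vocal_path|speaker_name|language|text
from pathlib import Path

ALLOWED_LANGS = {"zh", "ja", "en", "ko", "yue"}


def read_list_file(list_path: str):
    entries = []
    for raw in Path(list_path).read_text(encoding="utf-8").splitlines():
        line = raw.strip()
        if not line:
            continue
        # The text field may itself contain "|", so split only three times.
        vocal_path, speaker, lang, text = line.split("|", 3)
        if lang not in ALLOWED_LANGS:
            raise ValueError(f"unknown language code: {lang!r}")
        entries.append({"vocal_path": vocal_path, "speaker": speaker,
                        "language": lang, "text": text})
    return entries


if __name__ == "__main__":
    for item in read_list_file("output/asr_opt/demo.list"):  # placeholder path
        print(item["speaker"], item["language"], item["text"])
```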
|
188 |
+
|
189 |
+
## 微调与推理
|
190 |
+
|
191 |
+
### 打开 WebUI
|
192 |
+
|
193 |
+
#### 整合包用户
|
194 |
+
|
195 |
+
双击`go-webui.bat`或者使用`go-webui.ps1`
|
196 |
+
若想使用 V1,则双击`go-webui-v1.bat`或者使用`go-webui-v1.ps1`
|
197 |
+
|
198 |
+
#### 其他
|
199 |
+
|
200 |
+
```bash
|
201 |
+
python webui.py <language(optional)>
|
202 |
+
```
|
203 |
+
|
204 |
+
若想使用 V1,则
|
205 |
+
|
206 |
+
```bash
|
207 |
+
python webui.py v1 <language(optional)>
|
208 |
+
```
|
209 |
+
|
210 |
+
或者在 webUI 内动态切换
|
211 |
+
|
212 |
+
### 微调
|
213 |
+
|
214 |
+
#### 现已支持自动填充路径
|
215 |
+
|
216 |
+
1. 填入训练音频路径
|
217 |
+
2. 切割音频
|
218 |
+
3. 进行降噪(可选)
|
219 |
+
4. 进行ASR
|
220 |
+
5. 校对标注
|
221 |
+
6. 前往下一个窗口,点击训练
|
222 |
+
|
223 |
+
### 打开推理 WebUI
|
224 |
+
|
225 |
+
#### 整合包用户
|
226 |
+
|
227 |
+
双击 `go-webui.bat` 或者使用 `go-webui.ps1` ,然后在 `1-GPT-SoVITS-TTS/1C-推理` 中打开推理 webUI
|
228 |
+
|
229 |
+
#### 其他
|
230 |
+
|
231 |
+
```bash
|
232 |
+
python GPT_SoVITS/inference_webui.py <language(optional)>
|
233 |
+
```
|
234 |
+
|
235 |
+
或者
|
236 |
+
|
237 |
+
```bash
|
238 |
+
python webui.py
|
239 |
+
```
|
240 |
+
|
241 |
+
然后在 `1-GPT-SoVITS-TTS/1C-推理` 中打开推理 webUI
|
242 |
+
|
243 |
+
## V2 发布说明
|
244 |
+
|
245 |
+
新特性:
|
246 |
+
|
247 |
+
1. 支持韩语及粤语
|
248 |
+
|
249 |
+
2. 更好的文本前端
|
250 |
+
|
251 |
+
3. 底模由 2k 小时扩展至 5k 小时
|
252 |
+
|
253 |
+
4. 对低音质参考音频 (尤其是来源于网络的高频严重缺失、听着很闷的音频) 合成出来音质更好
|
254 |
+
|
255 |
+
详见[wiki](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v2%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
|
256 |
+
|
257 |
+
从 v1 环境迁移至 v2
|
258 |
+
|
259 |
+
1. 需要 pip 安装 requirements.txt 更新环境
|
260 |
+
|
261 |
+
2. 需要克隆 github 上的最新代码
|
262 |
+
|
263 |
+
3. 需要从[huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main/gsv-v2final-pretrained) 下载预训练模型文件放到 GPT_SoVITS\pretrained_models\gsv-v2final-pretrained 下
|
264 |
+
|
265 |
+
中文额外需要下载[G2PWModel.zip(HF)](https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/G2PWModel.zip)| [G2PWModel.zip(ModelScope)](https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/G2PWModel.zip) (下载 G2PW 模型,解压并重命名为`G2PWModel`,将其放到`GPT_SoVITS/text`目录下)
|
266 |
+
|
267 |
+
## V3 更新说明
|
268 |
+
|
269 |
+
新模型特点:
|
270 |
+
|
271 |
+
1. 音色相似度更高, 需要更少训练集来逼近本人 (不训练直接使用底模模式下音色相似性提升更大)
|
272 |
+
|
273 |
+
2. GPT 合成更稳定, 重复漏字更少, 也更容易跑出丰富情感
|
274 |
+
|
275 |
+
详见[wiki](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
|
276 |
+
|
277 |
+
从 v2 环境迁移至 v3
|
278 |
+
|
279 |
+
1. 需要 pip 安装 requirements.txt 更新环境
|
280 |
+
|
281 |
+
2. 需要克隆 github 上的最新代码
|
282 |
+
|
283 |
+
3. 从[huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main)下载这些 v3 新增预训练模型 (s1v3.ckpt, s2Gv3.pth and models--nvidia--bigvgan_v2_24khz_100band_256x folder)将他们放到`GPT_SoVITS\pretrained_models`目录下
|
284 |
+
|
285 |
+
如果想用音频超分功能缓解 v3 模型生成 24k 音频觉得闷的问题, 需要下载额外的模型参数, 参考[how to download](../../tools/AP_BWE_main/24kto48k/readme.txt)
|
286 |
+
|
287 |
+
## 待办事项清单
|
288 |
+
|
289 |
+
- [x] **高优先级:**
|
290 |
+
|
291 |
+
- [x] 日语和英语的本地化.
|
292 |
+
- [x] 用户指南.
|
293 |
+
- [x] 日语和英语数据集微调训练.
|
294 |
+
|
295 |
+
- [ ] **功能:**
|
296 |
+
- [x] 零样本声音转换 (5 秒) / 少样本声音转换 (1 分钟).
|
297 |
+
- [x] TTS 语速控制.
|
298 |
+
- [ ] ~~增强的 TTS 情感控制.~~
|
299 |
+
- [ ] 尝试将 SoVITS 令牌输入更改为词汇的概率分布.
|
300 |
+
- [x] 改进英语和日语文本前端.
|
301 |
+
- [ ] 开发体积小和更大的 TTS 模型.
|
302 |
+
- [x] Colab 脚本.
|
303 |
+
- [x] 扩展训练数据集 (从 2k 小时到 10k 小时).
|
304 |
+
- [x] 更好的 sovits 基础模型 (增强的音频质量).
|
305 |
+
- [ ] 模型混合.
|
306 |
+
|
307 |
+
## (附加) 命令行运行方式
|
308 |
+
|
309 |
+
使用命令行打开 UVR5 的 WebUI
|
310 |
+
|
311 |
+
```
|
312 |
+
python tools/uvr5/webui.py "<infer_device>" <is_half> <webui_port_uvr5>
|
313 |
+
```
|
314 |
+
|
315 |
+
<!-- 如果打不开浏览器, 请按照下面的格式进行UVR处理, 这是使用mdxnet进行音频处理的方式
|
316 |
+
````
|
317 |
+
python mdxnet.py --model --input_root --output_vocal --output_ins --agg_level --format --device --is_half_precision
|
318 |
+
```` -->
|
319 |
+
|
320 |
+
这是使用命令行完成数据集的音频切分的方式
|
321 |
+
|
322 |
+
```
|
323 |
+
python audio_slicer.py \
|
324 |
+
--input_path "<path_to_original_audio_file_or_directory>" \
|
325 |
+
--output_root "<directory_where_subdivided_audio_clips_will_be_saved>" \
|
326 |
+
--threshold <volume_threshold> \
|
327 |
+
--min_length <minimum_duration_of_each_subclip> \
|
328 |
+
    --min_interval <shortest_time_gap_between_adjacent_subclips> \
|
329 |
+
--hop_size <step_size_for_computing_volume_curve>
|
330 |
+
```
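If you prefer to drive the slicer from Python instead of the shell, the following hedged sketch simply wraps the command shown above with `subprocess`; the flag names come from the command above, while the numeric values and paths are placeholders, not recommended defaults.

```python
# Hedged wrapper around the audio_slicer command shown above; numeric values
# and paths below are placeholders for illustration only.
import subprocess


def slice_dataset(input_path: str, output_root: str) -> None:
    cmd = [
        "python", "audio_slicer.py",
        "--input_path", input_path,
        "--output_root", output_root,
        "--threshold", "-34",      # volume threshold (placeholder value)
        "--min_length", "4000",    # minimum duration of each subclip (placeholder)
        "--min_interval", "300",   # shortest gap between adjacent subclips (placeholder)
        "--hop_size", "10",        # step size for the volume curve (placeholder)
    ]
    subprocess.run(cmd, check=True)


if __name__ == "__main__":
    slice_dataset("raw_audio/", "output/slicer_opt/")  # placeholder paths
```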
|
331 |
+
|
332 |
+
这是使用命令行完成数据集 ASR 处理的方式 (仅限中文)
|
333 |
+
|
334 |
+
```
|
335 |
+
python tools/asr/funasr_asr.py -i <input> -o <output>
|
336 |
+
```
|
337 |
+
|
338 |
+
通过 Faster_Whisper 进行 ASR 处理 (除中文之外的 ASR 标记)
|
339 |
+
|
340 |
+
(没有进度条, GPU 性能可能会导致时间延迟)
|
341 |
+
|
342 |
+
```
|
343 |
+
python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p <precision>
|
344 |
+
```
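As a companion to the Faster-Whisper command above, here is a hedged `subprocess` sketch; the `-i/-o/-l/-p` flags are the ones listed above, while the language code, precision string, and directories are placeholder values you should adjust to your data.

```python
# Hedged wrapper around the Faster-Whisper ASR command shown above;
# language, precision, and paths are example values only.
import subprocess


def run_faster_whisper_asr(input_dir: str, output_dir: str,
                           language: str = "en", precision: str = "float16") -> None:
    subprocess.run(
        [
            "python", "./tools/asr/fasterwhisper_asr.py",
            "-i", input_dir,
            "-o", output_dir,
            "-l", language,    # ASR language (example value)
            "-p", precision,   # precision (example value)
        ],
        check=True,
    )


if __name__ == "__main__":
    run_faster_whisper_asr("output/slicer_opt/", "output/asr_opt/")  # placeholder paths
```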
|
345 |
+
|
346 |
+
启用自定义列表保存路径
|
347 |
+
|
348 |
+
## 致谢
|
349 |
+
|
350 |
+
特别感谢以下项目和贡献者:
|
351 |
+
|
352 |
+
### 理论研究
|
353 |
+
|
354 |
+
- [ar-vits](https://github.com/innnky/ar-vits)
|
355 |
+
- [SoundStorm](https://github.com/yangdongchao/SoundStorm/tree/master/soundstorm/s1/AR)
|
356 |
+
- [vits](https://github.com/jaywalnut310/vits)
|
357 |
+
- [TransferTTS](https://github.com/hcy71o/TransferTTS/blob/master/models.py#L556)
|
358 |
+
- [contentvec](https://github.com/auspicious3000/contentvec/)
|
359 |
+
- [hifi-gan](https://github.com/jik876/hifi-gan)
|
360 |
+
- [fish-speech](https://github.com/fishaudio/fish-speech/blob/main/tools/llama/generate.py#L41)
|
361 |
+
- [f5-TTS](https://github.com/SWivid/F5-TTS/blob/main/src/f5_tts/model/backbones/dit.py)
|
362 |
+
- [shortcut flow matching](https://github.com/kvfrans/shortcut-models/blob/main/targets_shortcut.py)
|
363 |
+
|
364 |
+
### 预训练模型
|
365 |
+
|
366 |
+
- [Chinese Speech Pretrain](https://github.com/TencentGameMate/chinese_speech_pretrain)
|
367 |
+
- [Chinese-Roberta-WWM-Ext-Large](https://huggingface.co/hfl/chinese-roberta-wwm-ext-large)
|
368 |
+
- [BigVGAN](https://github.com/NVIDIA/BigVGAN)
|
369 |
+
|
370 |
+
### 推理用文本前端
|
371 |
+
|
372 |
+
- [paddlespeech zh_normalization](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/zh_normalization)
|
373 |
+
- [split-lang](https://github.com/DoodleBears/split-lang)
|
374 |
+
- [g2pW](https://github.com/GitYCC/g2pW)
|
375 |
+
- [pypinyin-g2pW](https://github.com/mozillazg/pypinyin-g2pW)
|
376 |
+
- [paddlespeech g2pw](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/g2pw)
|
377 |
+
|
378 |
+
### WebUI 工具
|
379 |
+
|
380 |
+
- [ultimatevocalremovergui](https://github.com/Anjok07/ultimatevocalremovergui)
|
381 |
+
- [audio-slicer](https://github.com/openvpi/audio-slicer)
|
382 |
+
- [SubFix](https://github.com/cronrpc/SubFix)
|
383 |
+
- [FFmpeg](https://github.com/FFmpeg/FFmpeg)
|
384 |
+
- [gradio](https://github.com/gradio-app/gradio)
|
385 |
+
- [faster-whisper](https://github.com/SYSTRAN/faster-whisper)
|
386 |
+
- [FunASR](https://github.com/alibaba-damo-academy/FunASR)
|
387 |
+
- [AP-BWE](https://github.com/yxlu-0102/AP-BWE)
|
388 |
+
|
389 |
+
感谢 @Naozumi520 提供粤语训练集, 并在粤语相关知识方面给予指导.
|
390 |
+
|
391 |
+
## 感谢所有贡献者的努力
|
392 |
+
|
393 |
+
<a href="https://github.com/RVC-Boss/GPT-SoVITS/graphs/contributors" target="_blank">
|
394 |
+
<img src="https://contrib.rocks/image?repo=RVC-Boss/GPT-SoVITS" />
|
395 |
+
</a>
|
docs/en/Changelog_EN.md
ADDED
@@ -0,0 +1,222 @@
1 |
+
### 20240121 Update
|
2 |
+
|
3 |
+
1. Added `is_share` to the `config`. In scenarios like Colab, this can be set to `True` to map the WebUI to the public network.
|
4 |
+
2. Added English system translation support to WebUI.
|
5 |
+
3. The `cmd-asr` automatically detects if the FunASR model is included; if not found in the default directory, it will be downloaded from ModelScope.
|
6 |
+
4. Attempted to fix the SoVITS training ZeroDivisionError reported in [Issue 79](https://github.com/RVC-Boss/GPT-SoVITS/issues/79) by filtering samples with zero length, etc.
|
7 |
+
5. Cleaned up cached audio files and other files in the `TEMP` folder.
|
8 |
+
6. Significantly reduced the issue of synthesized audio containing the end of the reference audio.
|
9 |
+
|
10 |
+
### 20240122 Update
|
11 |
+
|
12 |
+
1. Fixed the issue where excessively short output files resulted in repeating the reference audio.
|
13 |
+
2. Tested native support for English and Japanese training (Japanese training requires the root directory to be free of non-English special characters).
|
14 |
+
3. Improved audio path checking. If an attempt is made to read from an incorrect input path, it will report that the path does not exist instead of an ffmpeg error.
|
15 |
+
|
16 |
+
### 20240123 Update
|
17 |
+
|
18 |
+
1. Resolved the issue where Hubert extraction caused NaN errors, leading to SoVITS/GPT training ZeroDivisionError.
|
19 |
+
2. Added support for quick model switching in the inference WebUI.
|
20 |
+
3. Optimized the model file sorting logic.
|
21 |
+
4. Replaced `jieba` with `jieba_fast` for Chinese word segmentation.
|
22 |
+
|
23 |
+
### 20240126 Update
|
24 |
+
|
25 |
+
1. Added support for Chinese-English mixed and Japanese-English mixed output texts.
|
26 |
+
2. Added an optional segmentation mode for output.
|
27 |
+
3. Fixed the issue of UVR5 reading and automatically jumping out of directories.
|
28 |
+
4. Fixed multiple newline issues causing inference errors.
|
29 |
+
5. Removed redundant logs in the inference WebUI.
|
30 |
+
6. Supported training and inference on Mac.
|
31 |
+
7. Automatically forced single precision for GPU that do not support half precision; enforced single precision under CPU inference.
|
32 |
+
|
33 |
+
### 20240128 Update
|
34 |
+
|
35 |
+
1. Fixed the issue with the pronunciation of numbers converting to Chinese characters.
|
36 |
+
2. Fixed the issue of swallowing a few characters at the beginning of sentences.
|
37 |
+
3. Excluded unreasonable reference audio lengths by setting restrictions.
|
38 |
+
4. Fixed the issue where GPT training did not save checkpoints.
|
39 |
+
5. Completed model downloading process in the Dockerfile.
|
40 |
+
|
41 |
+
### 20240129 Update
|
42 |
+
|
43 |
+
1. Changed training configurations to single precision for GPUs like the 16 series, which have issues with half precision training.
|
44 |
+
2. Tested and updated the available Colab version.
|
45 |
+
3. Fixed the issue of git cloning the ModelScope FunASR repository with older versions of FunASR causing interface misalignment errors.
|
46 |
+
|
47 |
+
### 20240130 Update
|
48 |
+
|
49 |
+
1. Automatically removed double quotes from all path-related entries to prevent errors from novice users copying paths with double quotes.
|
50 |
+
2. Fixed issues with splitting Chinese and English punctuation and added punctuation at the beginning and end of sentences.
|
51 |
+
3. Added splitting by punctuation.
|
52 |
+
|
53 |
+
### 20240201 Update
|
54 |
+
|
55 |
+
1. Fixed the UVR5 format reading error causing separation failures.
|
56 |
+
2. Supported automatic segmentation and language recognition for mixed Chinese-Japanese-English texts.
|
57 |
+
|
58 |
+
### 20240202 Update
|
59 |
+
|
60 |
+
1. Fixed the issue where an ASR path ending with `/` caused an error in saving the filename.
|
61 |
+
2. [PR 377](https://github.com/RVC-Boss/GPT-SoVITS/pull/377) introduced PaddleSpeech's Normalizer to fix issues like reading "xx.xx%" (percent symbols) and "元/吨" being read as "元吨" instead of "元每吨", and fixed underscore errors.
|
62 |
+
|
63 |
+
### 20240207 Update
|
64 |
+
|
65 |
+
1. Corrected language parameter confusion causing decreased Chinese inference quality reported in [Issue 391](https://github.com/RVC-Boss/GPT-SoVITS/issues/391).
|
66 |
+
2. [PR 403](https://github.com/RVC-Boss/GPT-SoVITS/pull/403) adapted UVR5 to higher versions of librosa.
|
67 |
+
3. [Commit 14a2851](https://github.com/RVC-Boss/GPT-SoVITS/commit/14a285109a521679f8846589c22da8f656a46ad8) fixed UVR5 inf everywhere error caused by `is_half` parameter not converting to boolean, resulting in constant half precision inference, which caused `inf` on 16 series GPUs.
|
68 |
+
4. Optimized English text frontend.
|
69 |
+
5. Fixed Gradio dependencies.
|
70 |
+
6. Supported automatic reading of `.list` full paths if the root directory is left blank during dataset preparation.
|
71 |
+
7. Integrated Faster Whisper ASR for Japanese and English.
|
72 |
+
|
73 |
+
### 20240208 Update
|
74 |
+
|
75 |
+
1. [Commit 59f35ad](https://github.com/RVC-Boss/GPT-SoVITS/commit/59f35adad85815df27e9c6b33d420f5ebfd8376b) attempted to fix GPT training hang on Windows 10 1909 and [Issue 232](https://github.com/RVC-Boss/GPT-SoVITS/issues/232) (Traditional Chinese System Language).
|
76 |
+
|
77 |
+
### 20240212 Update
|
78 |
+
|
79 |
+
1. Optimized logic for Faster Whisper and FunASR, switching Faster Whisper to mirror downloads to avoid issues with Hugging Face connections.
|
80 |
+
2. [PR 457](https://github.com/RVC-Boss/GPT-SoVITS/pull/457) enabled experimental DPO Loss training option to mitigate GPT repetition and missing characters by constructing negative samples during training and made several inference parameters available in the inference WebUI.
|
81 |
+
|
82 |
+
### 20240214 Update
|
83 |
+
|
84 |
+
1. Supported Chinese experiment names in training (previously caused errors).
|
85 |
+
2. Made DPO training an optional feature instead of mandatory. If selected, the batch size is automatically halved. Fixed issues with new parameters not being passed in the inference WebUI.
|
86 |
+
|
87 |
+
### 20240216 Update
|
88 |
+
|
89 |
+
1. Supported input without reference text.
|
90 |
+
2. Fixed bugs in Chinese frontend reported in [Issue 475](https://github.com/RVC-Boss/GPT-SoVITS/issues/475).
|
91 |
+
|
92 |
+
### 20240221 Update
|
93 |
+
|
94 |
+
1. Added a noise reduction option during data processing (noise reduction leaves only 16kHz sampling rate; use only if the background noise is significant).
|
95 |
+
2. [PR 559](https://github.com/RVC-Boss/GPT-SoVITS/pull/559), [PR 556](https://github.com/RVC-Boss/GPT-SoVITS/pull/556), [PR 532](https://github.com/RVC-Boss/GPT-SoVITS/pull/532), [PR 507](https://github.com/RVC-Boss/GPT-SoVITS/pull/507), [PR 509](https://github.com/RVC-Boss/GPT-SoVITS/pull/509) optimized Chinese and Japanese frontend processing.
|
96 |
+
3. Switched Mac CPU inference to use CPU instead of MPS for faster performance.
|
97 |
+
4. Fixed Colab public URL issue.
|
98 |
+
|
99 |
+
### 20240306 Update
|
100 |
+
|
101 |
+
1. [PR 672](https://github.com/RVC-Boss/GPT-SoVITS/pull/672) accelerated inference by 50% (tested on RTX3090 + PyTorch 2.2.1 + CU11.8 + Win10 + Py39) .
|
102 |
+
2. No longer requires downloading the Chinese FunASR model first when using Faster Whisper non-Chinese ASR.
|
103 |
+
3. [PR 610](https://github.com/RVC-Boss/GPT-SoVITS/pull/610) fixed UVR5 reverb removal model where the setting was reversed.
|
104 |
+
4. [PR 675](https://github.com/RVC-Boss/GPT-SoVITS/pull/675) enabled automatic CPU inference for Faster Whisper if no CUDA is available.
|
105 |
+
5. [PR 573](https://github.com/RVC-Boss/GPT-SoVITS/pull/573) modified `is_half` check to ensure proper CPU inference on Mac.
|
106 |
+
|
107 |
+
### 202403/202404/202405 Update
|
108 |
+
|
109 |
+
#### Minor Fixes:
|
110 |
+
|
111 |
+
1. Fixed issues with the no-reference text mode.
|
112 |
+
2. Optimized the Chinese and English text frontend.
|
113 |
+
3. Improved API format.
|
114 |
+
4. Fixed CMD format issues.
|
115 |
+
5. Added error prompts for unsupported languages during training data processing.
|
116 |
+
6. Fixed the bug in Hubert extraction.
|
117 |
+
|
118 |
+
#### Major Fixes:
|
119 |
+
|
120 |
+
1. Fixed the issue of SoVITS training without freezing VQ (which could cause quality degradation).
|
121 |
+
2. Added a quick inference branch.
|
122 |
+
|
123 |
+
### 20240610 Update
|
124 |
+
|
125 |
+
#### Minor Fixes:
|
126 |
+
|
127 |
+
1. [PR 1168](https://github.com/RVC-Boss/GPT-SoVITS/pull/1168) & [PR 1169](https://github.com/RVC-Boss/GPT-SoVITS/pull/1169) improved the logic for pure punctuation and multi-punctuation text input.
|
128 |
+
2. [Commit 501a74a](https://github.com/RVC-Boss/GPT-SoVITS/commit/501a74ae96789a26b48932babed5eb4e9483a232) fixed CMD format for MDXNet de-reverb in UVR5, supporting paths with spaces.
|
129 |
+
3. [PR 1159](https://github.com/RVC-Boss/GPT-SoVITS/pull/1159) fixed progress bar logic for SoVITS training in `s2_train.py`.
|
130 |
+
|
131 |
+
#### Major Fixes:
|
132 |
+
|
133 |
+
4. [Commit 99f09c8](https://github.com/RVC-Boss/GPT-SoVITS/commit/99f09c8bdc155c1f4272b511940717705509582a) fixed the issue of WebUI's GPT fine-tuning not reading BERT feature of Chinese input texts, causing inconsistency with inference and potential quality degradation.
|
134 |
+
**Caution: If you have previously fine-tuned with a large amount of data, it is recommended to retune the model to improve quality.**
|
135 |
+
|
136 |
+
### 20240706 Update
|
137 |
+
|
138 |
+
#### Minor Fixes:
|
139 |
+
|
140 |
+
1. [Commit 1250670](https://github.com/RVC-Boss/GPT-SoVITS/commit/db50670598f0236613eefa6f2d5a23a271d82041) fixed default batch size decimal issue in CPU inference.
|
141 |
+
2. [PR 1258](https://github.com/RVC-Boss/GPT-SoVITS/pull/1258), [PR 1265](https://github.com/RVC-Boss/GPT-SoVITS/pull/1265), [PR 1267](https://github.com/RVC-Boss/GPT-SoVITS/pull/1267) fixed issues where denoising or ASR encountering exceptions would exit all pending audio files.
|
142 |
+
3. [PR 1253](https://github.com/RVC-Boss/GPT-SoVITS/pull/1253) fixed the issue of splitting decimals when splitting by punctuation.
|
143 |
+
4. [Commit a208698](https://github.com/RVC-Boss/GPT-SoVITS/commit/a208698e775155efc95b187b746d153d0f2847ca) fixed multi-process save logic for multi-GPU training.
|
144 |
+
5. [PR 1251](https://github.com/RVC-Boss/GPT-SoVITS/pull/1251) removed redundant `my_utils`.
|
145 |
+
|
146 |
+
#### Major Fixes:
|
147 |
+
|
148 |
+
6. The accelerated inference code from [PR 672](https://github.com/RVC-Boss/GPT-SoVITS/pull/672) has been validated and merged into the main branch, ensuring consistent inference effects with the base.
|
149 |
+
It also supports accelerated inference in no-reference text mode.
|
150 |
+
|
151 |
+
**Future updates will continue to verify the consistency of changes in the `fast_inference` branch**.
|
152 |
+
|
153 |
+
### 20240727 Update
|
154 |
+
|
155 |
+
#### Minor Fixes:
|
156 |
+
|
157 |
+
1. [PR 1298](https://github.com/RVC-Boss/GPT-SoVITS/pull/1298) cleaned up redundant i18n code.
|
158 |
+
2. [PR 1299](https://github.com/RVC-Boss/GPT-SoVITS/pull/1299) fixed issues where trailing slashes in user file paths caused command line errors.
|
159 |
+
3. [PR 756](https://github.com/RVC-Boss/GPT-SoVITS/pull/756) fixed the step calculation logic in GPT training.
|
160 |
+
|
161 |
+
#### Major Fixes:
|
162 |
+
|
163 |
+
4. [Commit 9588a3c](https://github.com/RVC-Boss/GPT-SoVITS/commit/9588a3c52d9ebdb20b3c5d74f647d12e7c1171c2) supported speech rate adjustment for synthesis.
|
164 |
+
Enabled freezing randomness while only adjusting the speech rate.
|
165 |
+
|
166 |
+
### 20240806 Update
|
167 |
+
|
168 |
+
1. [PR 1306](https://github.com/RVC-Boss/GPT-SoVITS/pull/1306), [PR 1356](https://github.com/RVC-Boss/GPT-SoVITS/pull/1356) Added support for the BS RoFormer vocal accompaniment separation model. [Commit e62e965](https://github.com/RVC-Boss/GPT-SoVITS/commit/e62e965323a60a76a025bcaa45268c1ddcbcf05c) Enabled FP16 inference.
|
169 |
+
2. Improved Chinese text frontend.
|
170 |
+
- [PR 488](https://github.com/RVC-Boss/GPT-SoVITS/pull/488) added support for polyphonic characters (v2 only);
|
171 |
+
- [PR 987](https://github.com/RVC-Boss/GPT-SoVITS/pull/987) added quantifier;
|
172 |
+
- [PR 1351](https://github.com/RVC-Boss/GPT-SoVITS/pull/1351) supports arithmetic and basic math formulas;
|
173 |
+
- [PR 1404](https://github.com/RVC-Boss/GPT-SoVITS/pull/1404) fixed mixed text errors.
|
174 |
+
3. [PR 1355](https://github.com/RVC-Boss/GPT-SoVITS/pull/1356) automatically filled in the paths when processing audio in the WebUI.
|
175 |
+
4. [Commit bce451a](https://github.com/RVC-Boss/GPT-SoVITS/commit/bce451a2d1641e581e200297d01f219aeaaf7299), [Commit 4c8b761](https://github.com/RVC-Boss/GPT-SoVITS/commit/4c8b7612206536b8b4435997acb69b25d93acb78) optimized GPU recognition logic.
|
176 |
+
5. [Commit 8a10147](https://github.com/RVC-Boss/GPT-SoVITS/commit/8a101474b5a4f913b4c94fca2e3ca87d0771bae3) added support for Cantonese ASR.
|
177 |
+
6. Added support for GPT-SoVITS v2.
|
178 |
+
7. [PR 1387](https://github.com/RVC-Boss/GPT-SoVITS/pull/1387) optimized timing logic.
|
179 |
+
|
180 |
+
### 20240821 Update
|
181 |
+
|
182 |
+
1. [PR 1490](https://github.com/RVC-Boss/GPT-SoVITS/pull/1490) Merge the `fast_inference` branch into the main branch.
|
183 |
+
2. [Issue 1508](https://github.com/RVC-Boss/GPT-SoVITS/issues/1508) Support for optimizing numbers, phone numbers, dates, and times using SSML tags.
|
184 |
+
3. [PR 1503](https://github.com/RVC-Boss/GPT-SoVITS/pull/1503) Fixed and optimized API.
|
185 |
+
4. [PR 1422](https://github.com/RVC-Boss/GPT-SoVITS/pull/1422) Fixed the bug where only one reference audio could be uploaded for mixing, Added various dataset checks with warnings popping up if missing files.
|
186 |
+
|
187 |
+
### 20250211 Update
|
188 |
+
|
189 |
+
- [Wiki](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)) Added GPT-SoVITS v3 Model, Need 14GB GPU Memory to Fine-tune SoVITS v3.
|
190 |
+
|
191 |
+
### 20250212 Update
|
192 |
+
|
193 |
+
- [PR 2040](https://github.com/RVC-Boss/GPT-SoVITS/pull/2040) Added gradient checkpointing to Fine-tune SoVITS v3, Need 12GB GPU Memory.
|
194 |
+
|
195 |
+
### 20250214 Update
|
196 |
+
|
197 |
+
- [PR 2047](https://github.com/RVC-Boss/GPT-SoVITS/pull/2047) Optimize the multilingual mixed text segmentation strategy **A**.
|
198 |
+
- Added `split-lang` as a language segmentation tool to improve segmentation capabilities for multi-language mixed text.
|
199 |
+
|
200 |
+
### 20250217 Update
|
201 |
+
|
202 |
+
- [PR 2062](https://github.com/RVC-Boss/GPT-SoVITS/pull/2062) Optimize the logic for handling numbers and English in the text.
|
203 |
+
|
204 |
+
### 20250218 Update
|
205 |
+
|
206 |
+
- [PR 2073](https://github.com/RVC-Boss/GPT-SoVITS/pull/2073) Optimize the multilingual mixed text segmentation strategy **B**.
|
207 |
+
|
208 |
+
### 20250223 Update
|
209 |
+
|
210 |
+
1. LoRA training is supported for fine-tuning with SoVITS V3. It requires 8GB GPU Memory and the results are better than full parameter fine-tuning.
|
211 |
+
2. [PR 2078](https://github.com/RVC-Boss/GPT-SoVITS/pull/2078) Added Mel Band RoFormer model for Vocal & Instrument Separation.
|
212 |
+
|
213 |
+
### 20250226 Update
|
214 |
+
|
215 |
+
1. [PR 2112](https://github.com/RVC-Boss/GPT-SoVITS/pull/2112) Fix issues caused by non-English directories in Windows.
|
216 |
+
- Using `langsegmenter` for Korean.
|
217 |
+
2. [PR 2113](https://github.com/RVC-Boss/GPT-SoVITS/pull/2114) Fix issues caused by non-English directories in Windows.
|
218 |
+
- Using `langsegmenter` for Korean/Japanese.
|
219 |
+
|
220 |
+
### 20250227 Update
|
221 |
+
|
222 |
+
- Added 24K to 48K audio super-resolution models to alleviate the muffled issue when generating 24K audio with V3 model, as reported in [Issue 2085](https://github.com/RVC-Boss/GPT-SoVITS/issues/2085), [Issue 2117](https://github.com/RVC-Boss/GPT-SoVITS/issues/2117).
|
docs/ja/Changelog_JA.md
ADDED
@@ -0,0 +1,221 @@
1 |
+
### 20240121 更新
|
2 |
+
|
3 |
+
1. `config`に`is_share`を追加し、Colab などの環境でこれを`True`に設定すると、webui を公共ネットワークにマッピングできます.
|
4 |
+
2. WebUI に英語システムの英語翻訳を追加しました.
|
5 |
+
3. `cmd-asr`は FunASR モデルが既に含まれているかどうかを自動的に確認し、デフォルトのパスにない場合は modelscope から自動的にダウンロードします.
|
6 |
+
4. [SoVITS 训练报错 ZeroDivisionError](https://github.com/RVC-Boss/GPT-SoVITS/issues/79) 修復を試みます (長さ 0 のサンプルをフィルタリングなど)
|
7 |
+
5. TEMP ファイルフォルダからオーディオやその他のファイルをクリーンアップして最適化します.
|
8 |
+
6. 合成オーディオがリファレンスオーディオの終わりを含む問題を大幅に改善しました.
|
9 |
+
|
10 |
+
### 20240122 更新
|
11 |
+
|
12 |
+
1. 短すぎる出力ファイルが重複したリファレンスオーディオを返す問題を修正しました.
|
13 |
+
2. 英語-日本語学習がスムーズに進む QA を完了しました. (ただし、日本語学習はルートディレクトリに英語以外の文字が含まれていない必要があります)
|
14 |
+
3. オーディオパスをチェックします.間違ったパスを読み取ろうとすると、「パスが存在しません」というエラーメッセージが返されます.これは ffmpeg モジュールのエラーではありません.
|
15 |
+
|
16 |
+
### 20240123 更新
|
17 |
+
|
18 |
+
1. hubert から nan 抽出による SoVITS/GPT 学習中の ZeroDivisionError 関連エラーを修正しました.
|
19 |
+
2. 推論インターフェースでモデルを素早く切り替えることができるようにサポートしました.
|
20 |
+
3. モデルファイルのソートロジックを最適化しました.
|
21 |
+
4. 中国語の単語分割のために `jieba` を `jieba_fast` に置き換えました.
|
22 |
+
|
23 |
+
### 20240126 更新
|
24 |
+
|
25 |
+
1. 中国語と英語、日本語と英語が混在した出力テキストをサポートします.
|
26 |
+
2. 出力で選択的な分割モードをサポートします.
|
27 |
+
3. uvr5 がディレクトリを読み取り、自動的に終了する問題を修正しました.
|
28 |
+
4. 複数の改行による推論エラーを修正しました.
|
29 |
+
5. 推論インターフェースから不要なログを削除しました.
|
30 |
+
6. MacOS での学習と推論をサポートします.
|
31 |
+
7. 半精度をサポートしていないカードを自動的に識別して単精度を強制し、CPU 推論では単精度を強制します.
|
32 |
+
|
33 |
+
### 20240128 更新
|
34 |
+
|
35 |
+
1. 数字を漢字で読む問題を修正しました.
|
36 |
+
2. 文章の先頭の一部の単語が欠落する問題を修正しました.
|
37 |
+
3. 不適切な長さのリファレンスオーディオを制限しました.
|
38 |
+
4. GPT 学習時の ckpt が保存されない問題を修正しました.
|
39 |
+
5. Dockerfile のモデルダウンロードプロセスを改善しました.
|
40 |
+
|
41 |
+
### 20240129 更新
|
42 |
+
|
43 |
+
1. 16 系などの半精度学習に問題があるカードは、学習構成を単精度学習に変更しました.
|
44 |
+
2. Colab でも使用可能なバージョンをテストして更新しました.
|
45 |
+
3. ModelScope FunASR リポジトリの古いバージョンで git クローンを行う際のインターフェース不整合エラーの問題を修正しました.
|
46 |
+
|
47 |
+
### 20240130 更新
|
48 |
+
|
49 |
+
1. パスと関連する文字列を解析して、二重引用符を自動的に削除します.また、パスをコピーする場合、二重引用符が含まれていてもエラーが発生しません.
|
50 |
+
2. 中国語と英語の句読点分割の問題を修正し、文の先頭と末尾に句読点を追加しました.
|
51 |
+
3. 句読点による分割を追加しました.
|
52 |
+
|
53 |
+
### 20240201 更新
|
54 |
+
|
55 |
+
1. UVR5 形式の読み取りエラーによる分離失敗を修正しました.
|
56 |
+
2. 中国語・日本語・英語の混合テキストに対する自動分割と言語認識をサポートしました.
|
57 |
+
|
58 |
+
### 20240202 更新
|
59 |
+
|
60 |
+
1. ASRパスが `/` で終わることによるファイル名保存エラーの問題を修正しました.
|
61 |
+
2. [PR 377](https://github.com/RVC-Boss/GPT-SoVITS/pull/377) で PaddleSpeech の Normalizer を導入し、"xx.xx%" (パーセント記号) の読み取りや"元/吨"が"元吨"ではなく"元每吨"と読まれる問題、アンダースコアエラーを修正しました.
|
62 |
+
|
63 |
+
### 20240207 更新
|
64 |
+
|
65 |
+
1. [Issue 391](https://github.com/RVC-Boss/GPT-SoVITS/issues/391) で報告された中国語推論品質の低下を引き起こした言語パラメータの混乱を修正しました.
|
66 |
+
2. [PR 403](https://github.com/RVC-Boss/GPT-SoVITS/pull/403) で UVR5 を librosa のより高いバージョンに適応させました.
|
67 |
+
3. [Commit 14a2851](https://github.com/RVC-Boss/GPT-SoVITS/commit/14a285109a521679f8846589c22da8f656a46ad8) で、`is_half` パラメータがブール値に変換されず、常に半精度推論が行われ、16 シリーズの GPU で `inf` が発生する UVR5 inf everywhereエラーを修正しました.
|
68 |
+
4. 英語テキストフロントエンドを最適化しました.
|
69 |
+
5. Gradio の依存関係を修正しました.
|
70 |
+
6. データセット準備中にルートディレクトリが空白の場合、`.list` フルパスの自動読み取りをサポートしました.
|
71 |
+
7. 日本語と英語のために Faster Whisper ASR を統合しました.
|
72 |
+
|
73 |
+
### 20240208 更新
|
74 |
+
|
75 |
+
1. [Commit 59f35ad](https://github.com/RVC-Boss/GPT-SoVITS/commit/59f35adad85815df27e9c6b33d420f5ebfd8376b) で、Windows 10 1909 および [Issue 232](https://github.com/RVC-Boss/GPT-SoVITS/issues/232) (繁体字中国語システム言語) での GPT トレーニングのハングを修正する試みを行いました.
|
76 |
+
|
77 |
+
### 20240212 更新
|
78 |
+
|
79 |
+
1. Faster Whisper と FunASR のロジックを最適化し、Faster Whisper をミラーダウンロードに切り替えて Hugging Face の接続問題を回避しました.
|
80 |
+
2. [PR 457](https://github.com/RVC-Boss/GPT-SoVITS/pull/457) で、GPT の繰り返しと文字欠落を軽減するために、トレーニング中に負のサンプルを構築する実験的なDPO Lossトレーニングオプションを有効にし、いくつかの推論パラメータを推論WebUIで利用可能にしました.
|
81 |
+
|
82 |
+
### 20240214 更新
|
83 |
+
|
84 |
+
1. トレーニングで中国語の実験名をサポート (以前はエラーが発生していました).
|
85 |
+
2. DPOトレーニングを必須ではなくオプション機能に変更.選択された場合、バッチサイズは自動的に半分になります.推論 WebUI で新しいパラメータが渡されない問題を修正しました.
|
86 |
+
|
87 |
+
### 20240216 更新
|
88 |
+
|
89 |
+
1. 参照テキストなしでの入力をサポート.
|
90 |
+
2. [Issue 475](https://github.com/RVC-Boss/GPT-SoVITS/issues/475) で報告された中国語フロントエンドのバグを修正しました.
|
91 |
+
|
92 |
+
### 20240221 更新
|
93 |
+
|
94 |
+
1. データ処理中のノイズ低減オプションを追加 (ノイズ低減は16kHzサンプリングレートのみを残します;背景ノイズが大きい場合にのみ使用してください).
|
95 |
+
2. [PR 559](https://github.com/RVC-Boss/GPT-SoVITS/pull/559), [PR 556](https://github.com/RVC-Boss/GPT-SoVITS/pull/556), [PR 532](https://github.com/RVC-Boss/GPT-SoVITS/pull/532), [PR 507](https://github.com/RVC-Boss/GPT-SoVITS/pull/507), [PR 509](https://github.com/RVC-Boss/GPT-SoVITS/pull/509) で中国語と日本語のフロントエンド処理を最適化しました.
|
96 |
+
3. Mac CPU 推論を MPS ではなく CPU を使用するように切り替え、パフォーマンスを向上させました.
|
97 |
+
4. Colab のパブリック URL の問題を修正しました.
|
98 |
+
### 20240306 更新
|
99 |
+
|
100 |
+
1. [PR 672](https://github.com/RVC-Boss/GPT-SoVITS/pull/672) で推論速度を50%向上させました (RTX3090 + PyTorch 2.2.1 + CU11.8 + Win10 + Py39 でテスト).
|
101 |
+
2. Faster Whisper非中国語ASRを使用する際、最初に中国語FunASRモデルをダウンロードする必要がなくなりました.
|
102 |
+
3. [PR 610](https://github.com/RVC-Boss/GPT-SoVITS/pull/610) で UVR5 残響除去モデルの設定が逆になっていた問題を修正しました.
|
103 |
+
4. [PR 675](https://github.com/RVC-Boss/GPT-SoVITS/pull/675) で、CUDA が利用できない場合に Faster Whisper の自動 CPU 推論を有効にしました.
|
104 |
+
5. [PR 573](https://github.com/RVC-Boss/GPT-SoVITS/pull/573) で、Mac での適切なCPU推論を確保するために `is_half` チェックを修正しました.
|
105 |
+
|
106 |
+
### 202403/202404/202405 更新
|
107 |
+
|
108 |
+
#### マイナー修正:
|
109 |
+
|
110 |
+
1. 参照テキストなしモードの問題を修正しました.
|
111 |
+
2. 中国語と英語のテキストフロントエンドを最適化しました.
|
112 |
+
3. API フォーマットを改善しました.
|
113 |
+
4. CMD フォーマットの問題を修正しました.
|
114 |
+
5. トレーニングデータ処理中のサポートされていない言語に対するエラープロンプトを追加しました.
|
115 |
+
6. Hubert 抽出のバグを修正しました.
|
116 |
+
|
117 |
+
#### メジャー修正:
|
118 |
+
|
119 |
+
1. SoVITS トレーニングで VQ を凍結せずに品質低下を引き起こす問題を修正しました.
|
120 |
+
2. クイック推論ブランチを追加しました.
|
121 |
+
|
122 |
+
### 20240610 更新
|
123 |
+
|
124 |
+
#### マイナー修正:
|
125 |
+
|
126 |
+
1. [PR 1168](https://github.com/RVC-Boss/GPT-SoVITS/pull/1168) & [PR 1169](https://github.com/RVC-Boss/GPT-SoVITS/pull/1169)で、純粋な句読点および複数の句読点を含むテキスト入力のロジックを改善しました.
|
127 |
+
2. [Commit 501a74a](https://github.com/RVC-Boss/GPT-SoVITS/commit/501a74ae96789a26b48932babed5eb4e9483a232)で、UVR5 の MDXNet デリバブをサポートする CMD フォーマットを修正し、スペースを含むパスをサポートしました.
|
128 |
+
3. [PR 1159](https://github.com/RVC-Boss/GPT-SoVITS/pull/1159)で、`s2_train.py` の SoVITS トレーニングのプログレスバーロジックを修正しました.
|
129 |
+
|
130 |
+
#### メジャー修正:
|
131 |
+
|
132 |
+
4. [Commit 99f09c8](https://github.com/RVC-Boss/GPT-SoVITS/commit/99f09c8bdc155c1f4272b511940717705509582a) で、WebUI の GPT ファインチューニングが中国語入力テキストの BERT 特徴を読み取らず、推論との不一致や品質低下の可能性を修正しました.
|
133 |
+
**注意: 以前に大量のデータでファインチューニングを行った場合、品質向上のためにモデルを再調整することをお勧めします.**
|
134 |
+
|
135 |
+
### 20240706 更新
|
136 |
+
|
137 |
+
#### マイナー修正:
|
138 |
+
|
139 |
+
1. [Commit 1250670](https://github.com/RVC-Boss/GPT-SoVITS/commit/db50670598f0236613eefa6f2d5a23a271d82041) で、CPU 推論のデフォルトバッチサイズの小数点問題を修正しました.
|
140 |
+
2. [PR 1258](https://github.com/RVC-Boss/GPT-SoVITS/pull/1258), [PR 1265](https://github.com/RVC-Boss/GPT-SoVITS/pull/1265), [PR 1267](https://github.com/RVC-Boss/GPT-SoVITS/pull/1267) で、ノイズ除去またはASRが例外に遭遇した場合に、すべての保留中のオーディオファイルが終了する問題を修正しました.
|
141 |
+
3. [PR 1253](https://github.com/RVC-Boss/GPT-SoVITS/pull/1253) で、句読点で分割する際の小数点分割の問題を修正しました.
|
142 |
+
4. [Commit a208698](https://github.com/RVC-Boss/GPT-SoVITS/commit/a208698e775155efc95b187b746d153d0f2847ca) で、マルチGPUトレーニングのマルチプロセス保存ロジックを修正しました.
|
143 |
+
5. [PR 1251](https://github.com/RVC-Boss/GPT-SoVITS/pull/1251) で、不要な `my_utils` を削除しました.
|
144 |
+
|
145 |
+
#### メジャー修正:
|
146 |
+
|
147 |
+
6. [PR 672](https://github.com/RVC-Boss/GPT-SoVITS/pull/672) の加速推論コードが検証され、メインブランチにマージされ、ベースとの推論効果の一貫性が確保されました.
|
148 |
+
また、参照テキストなしモードでの加速推論もサポートしています.
|
149 |
+
|
150 |
+
**今後の更新では、`fast_inference`ブランチの変更の一貫性を継続的に検証します**.
|
151 |
+
|
152 |
+
### 20240727 更新
|
153 |
+
|
154 |
+
#### マイナー修正:
|
155 |
+
|
156 |
+
1. [PR 1298](https://github.com/RVC-Boss/GPT-SoVITS/pull/1298) で、不要な i18n コードをクリーンアップしました.
|
157 |
+
2. [PR 1299](https://github.com/RVC-Boss/GPT-SoVITS/pull/1299) で、ユーザーファイルパスの末尾のスラッシュがコマンドラインエラーを引き起こす問題を修正しました.
|
158 |
+
3. [PR 756](https://github.com/RVC-Boss/GPT-SoVITS/pull/756) で、GPT トレーニングのステップ計算ロジックを修正しました.
|
159 |
+
|
160 |
+
#### メジャー修正:
|
161 |
+
|
162 |
+
4. [Commit 9588a3c](https://github.com/RVC-Boss/GPT-SoVITS/commit/9588a3c52d9ebdb20b3c5d74f647d12e7c1171c2) で、合成のスピーチレート調整をサポートしました.
|
163 |
+
スピーチレートのみを調整しながらランダム性を固定できるようになりました.
|
164 |
+
|
165 |
+
### 20240806 更新
|
166 |
+
|
167 |
+
1. [PR 1306](https://github.com/RVC-Boss/GPT-SoVITS/pull/1306)、[PR 1356](https://github.com/RVC-Boss/GPT-SoVITS/pull/1356) BS RoFormer ボーカルアコムパニ分離モデルのサポートを追加しました.[Commit e62e965](https://github.com/RVC-Boss/GPT-SoVITS/commit/e62e965323a60a76a025bcaa45268c1ddcbcf05c) FP16 推論を有効にしました.
|
168 |
+
2. 中国語テキストフロントエンドを改善しました.
|
169 |
+
- [PR 488](https://github.com/RVC-Boss/GPT-SoVITS/pull/488) 多音字のサポートを追加 (v2 のみ);
|
170 |
+
- [PR 987](https://github.com/RVC-Boss/GPT-SoVITS/pull/987) 量詞を追加;
|
171 |
+
- [PR 1351](https://github.com/RVC-Boss/GPT-SoVITS/pull/1351) 四則演算と基本数式のサポート;
|
172 |
+
- [PR 1404](https://github.com/RVC-Boss/GPT-SoVITS/pull/1404) 混合テキストエラーを修正.
|
173 |
+
3. [PR 1355](https://github.com/RVC-Boss/GPT-SoVITS/pull/1356) WebUIでオーディオ処理時にパスを自動入力しました.
|
174 |
+
4. [Commit bce451a](https://github.com/RVC-Boss/GPT-SoVITS/commit/bce451a2d1641e581e200297d01f219aeaaf7299), [Commit 4c8b761](https://github.com/RVC-Boss/GPT-SoVITS/commit/4c8b7612206536b8b4435997acb69b25d93acb78) GPU 認識ロジックを最適化しました.
|
175 |
+
5. [Commit 8a10147](https://github.com/RVC-Boss/GPT-SoVITS/commit/8a101474b5a4f913b4c94fca2e3ca87d0771bae3) 広東語ASRのサポートを追加しました.
|
176 |
+
6. GPT-SoVITS v2 のサポートを追加しました.
|
177 |
+
7. [PR 1387](https://github.com/RVC-Boss/GPT-SoVITS/pull/1387) タイミングロジックを最適化しました.
|
178 |
+
|
179 |
+
### 20240821 更新
|
180 |
+
|
181 |
+
1. [PR 1490](https://github.com/RVC-Boss/GPT-SoVITS/pull/1490) `fast_inference` ブランチをメインブランチにマージしました.
|
182 |
+
2. [Issue 1508](https://github.com/RVC-Boss/GPT-SoVITS/issues/1508) SSMLタグを使用して数字、電話番号、日付、時間などの最適化をサポートしました.
|
183 |
+
3. [PR 1503](https://github.com/RVC-Boss/GPT-SoVITS/pull/1503) APIの修正と最適化を行いました.
|
184 |
+
4. [PR 1422](https://github.com/RVC-Boss/GPT-SoVITS/pull/1422) 参照音声のミキシングで1つしかアップロードできないバグを修正し、データセットの各種チェックを追加してファイルが欠落している場合に警告を表示するようにしました.
|
185 |
+
|
186 |
+
### 20250211 更新
|
187 |
+
|
188 |
+
1. [Wiki](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)) GPT-SoVITS v3 モデルを追加しました.SoVITS v3のファインチューニングには14GBのGPUメモリが必要です.
|
189 |
+
|
190 |
+
### 20250212 更新
|
191 |
+
|
192 |
+
- [PR 2040](https://github.com/RVC-Boss/GPT-SoVITS/pull/2040) SoVITS v3のファインチューニングにグラデーションチェックポイントを追加、12GBのGPUメモリが必要です.
|
193 |
+
|
194 |
+
### 20250214 更新
|
195 |
+
|
196 |
+
- [PR 2047](https://github.com/RVC-Boss/GPT-SoVITS/pull/2047) 多言語混合テキスト分割戦略の最適化 **A**.
|
197 |
+
- `split-lang`を言語分割ツールとして追加し、多言語混合テキストの分割能力を向上させました.
|
198 |
+
|
199 |
+
### 20250217 更新
|
200 |
+
|
201 |
+
- [PR 2062](https://github.com/RVC-Boss/GPT-SoVITS/pull/2062) テキスト内の数字と英語の処理ロジックを最適化.
|
202 |
+
|
203 |
+
### 20250218 更新
|
204 |
+
|
205 |
+
- [PR 2073](https://github.com/RVC-Boss/GPT-SoVITS/pull/2073) 多言語混合テキスト分割戦略の最適化 **B**.
|
206 |
+
|
207 |
+
### 20250223 更新
|
208 |
+
|
209 |
+
1. LoRAトレーニングがSoVITS V3のファインチューニングに対応しました.8GBのGPUメモリが必要で、結果はフルパラメータファインチューニングより優れています.
|
210 |
+
2. [PR 2078](https://github.com/RVC-Boss/GPT-SoVITS/pull/2078) ボーカルと楽器分離のためにMel Band RoFormerモデルを追加しました.
|
211 |
+
|
212 |
+
### 20250226 更新
|
213 |
+
|
214 |
+
1. [PR 2112](https://github.com/RVC-Boss/GPT-SoVITS/pull/2112) Windowsでの非英語ディレクトリによる問題を修正しました.
|
215 |
+
- `langsegmenter`を使用して韓国語の問題を修正.
|
216 |
+
2. [PR 2113](https://github.com/RVC-Boss/GPT-SoVITS/pull/2114) Windowsでの非英語ディレクトリによる問題を修正しました.
|
217 |
+
- `langsegmenter`を使用して韓国語/日本語の問題を修正.
|
218 |
+
|
219 |
+
### 20250227 更新
|
220 |
+
|
221 |
+
- V3モデルで24Kオーディオを生成する際に発生するこもった音の問題を緩和するために、24Kから48Kのオーディオ超解像モデルを追加しました.[Issue 2085](https://github.com/RVC-Boss/GPT-SoVITS/issues/2085)、[Issue 2117](https://github.com/RVC-Boss/GPT-SoVITS/issues/2117)で報告されています.
|
docs/ja/README.md
ADDED
@@ -0,0 +1,383 @@
1 |
+
<div align="center">
|
2 |
+
|
3 |
+
<h1>GPT-SoVITS-WebUI</h1>
|
4 |
+
パワフルなFew-Shot音声変換・音声合成 WebUI.<br><br>
|
5 |
+
|
6 |
+
[](https://github.com/RVC-Boss/GPT-SoVITS)
|
7 |
+
|
8 |
+
<img src="https://counter.seku.su/cmoe?name=gptsovits&theme=r34" /><br>
|
9 |
+
|
10 |
+
[](https://colab.research.google.com/github/RVC-Boss/GPT-SoVITS/blob/main/colab_webui.ipynb)
|
11 |
+
[](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/LICENSE)
|
12 |
+
[](https://huggingface.co/spaces/lj1995/GPT-SoVITS-v2)
|
13 |
+
[](https://discord.gg/dnrgs5GHfG)
|
14 |
+
|
15 |
+
[**English**](../../README.md) | [**中文简体**](../cn/README.md) | **日本語** | [**한국어**](../ko/README.md) | [**Türkçe**](../tr/README.md)
|
16 |
+
|
17 |
+
</div>
|
18 |
+
|
19 |
+
---
|
20 |
+
|
21 |
+
## 機能:
|
22 |
+
|
23 |
+
1. **Zero-Shot TTS:** たった 5 秒間の音声サンプルで、即座にテキストからその音声に変換できます.
|
24 |
+
|
25 |
+
2. **Few-Shot TTS:** わずか 1 分間のトレーニングデータでモデルを微調整し、音声のクオリティを向上.
|
26 |
+
|
27 |
+
3. **多言語サポート:** 現在、英語、日本語、韓国語、広東語、中国語をサポートしています.
|
28 |
+
|
29 |
+
4. **WebUI ツール:** 統合されたツールは、音声と伴奏 (BGM 等) の分離、トレーニングセットの自動セグメンテーション、ASR (中国語のみ)、テキストラベリング等を含むため、初心者の方でもトレーニングデータセットの作成や GPT/SoVITS モデルのトレーニング等を非常に簡単に行えます.
|
30 |
+
|
31 |
+
**[デモ動画](https://www.bilibili.com/video/BV12g4y1m7Uw)をチェック!**
|
32 |
+
|
33 |
+
声の事前学習無しかつ Few-Shot でトレーニングされたモデルのデモ:
|
34 |
+
|
35 |
+
https://github.com/RVC-Boss/GPT-SoVITS/assets/129054828/05bee1fa-bdd8-4d85-9350-80c060ab47fb
|
36 |
+
|
37 |
+
**ユーザーマニュアル: [简体中文](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e) | [English](https://rentry.co/GPT-SoVITS-guide#/)**
|
38 |
+
|
39 |
+
## インストール
|
40 |
+
|
41 |
+
### テスト済みの環境
|
42 |
+
|
43 |
+
| Python Version | PyTorch Version | Device |
|
44 |
+
|----------------|------------------|-----------------|
|
45 |
+
| Python 3.9 | PyTorch 2.0.1 | CUDA 11.8 |
|
46 |
+
| Python 3.10.13 | PyTorch 2.1.2 | CUDA 12.3 |
|
47 |
+
| Python 3.10.17 | PyTorch 2.5.1 | CUDA 12.4 |
|
48 |
+
| Python 3.9 | PyTorch 2.5.1 | Apple silicon |
|
49 |
+
| Python 3.11 | PyTorch 2.6.0 | Apple silicon |
|
50 |
+
| Python 3.9 | PyTorch 2.2.2 | CPU |
|
51 |
+
|
52 |
+
### Windows
|
53 |
+
|
54 |
+
Windows ユーザー: (Windows 10 以降でテスト済み)、[統合パッケージをダウンロード](https://huggingface.co/lj1995/GPT-SoVITS-windows-package/resolve/main/GPT-SoVITS-v3lora-20250228.7z?download=true)し、解凍後に _go-webui.bat_ をダブルクリックすると、GPT-SoVITS-WebUI が起動します.
|
55 |
+
|
56 |
+
### Linux
|
57 |
+
|
58 |
+
```bash
|
59 |
+
conda create -n GPTSoVits python=3.9
|
60 |
+
conda activate GPTSoVits
|
61 |
+
bash install.sh --source <HF|HF-Mirror|ModelScope> [--download-uvr5]
|
62 |
+
```
|
63 |
+
|
64 |
+
### macOS
|
65 |
+
|
66 |
+
**注: Mac で GPU を使用して訓練されたモデルは、他のデバイスで訓練されたモデルと比較して著しく品質が低下するため、当面は CPU を使用して訓練することを強く推奨します.**
|
67 |
+
|
68 |
+
1. `xcode-select --install` を実行して、Xcode コマンドラインツールをインストールします.
|
69 |
+
2. 以下のコマンドを実行してこのプロジェクトをインストールします.
|
70 |
+
|
71 |
+
```bash
|
72 |
+
conda create -n GPTSoVits python=3.9
|
73 |
+
conda activate GPTSoVits
|
74 |
+
bash install.sh --source <HF|HF-Mirror|ModelScope> [--download-uvr5]
|
75 |
+
```
|
76 |
+
|
77 |
+
### 手動インストール
|
78 |
+
|
79 |
+
#### FFmpeg をインストールします.
|
80 |
+
|
81 |
+
##### Conda ユーザー
|
82 |
+
|
83 |
+
```bash
|
84 |
+
conda install ffmpeg
|
85 |
+
```
|
86 |
+
|
87 |
+
##### Ubuntu/Debian ユーザー
|
88 |
+
|
89 |
+
```bash
|
90 |
+
sudo apt install ffmpeg
|
91 |
+
sudo apt install libsox-dev
|
92 |
+
conda install -c conda-forge 'ffmpeg<7'
|
93 |
+
```
|
94 |
+
|
95 |
+
##### Windows ユーザー
|
96 |
+
|
97 |
+
[ffmpeg.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffmpeg.exe) と [ffprobe.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffprobe.exe) をダウンロードし、GPT-SoVITS のルートフォルダに置きます.
|
98 |
+
|
99 |
+
##### MacOS ユーザー
|
100 |
+
|
101 |
+
```bash
|
102 |
+
brew install ffmpeg
|
103 |
+
```
|
104 |
+
|
105 |
+
#### 依存関係をインストールします
|
106 |
+
|
107 |
+
```bash
|
108 |
+
pip install -r extra-req.txt --no-deps
|
109 |
+
pip install -r requirements.txt
|
110 |
+
```
|
111 |
+
|
112 |
+
### Docker の使用
|
113 |
+
|
114 |
+
#### docker-compose.yaml の設定
|
115 |
+
|
116 |
+
0. イメージのタグについて: コードベースの更新が速い割に、イメージのパッケージングとテストが遅いため、[Docker Hub](https://hub.docker.com/r/breakstring/gpt-sovits)(古いバージョン) で現在パッケージされている最新のイメージをご覧になり、ご自身の状況に応じて選択するか、またはご自身のニーズに応じて Dockerfile を使用してローカルでビルドしてください.
|
117 |
+
1. 環境変数:
|
118 |
+
|
119 |
+
- `is_half`: 半精度/単精度の制御."SSL 抽出"ステップ中に`4-cnhubert/5-wav32k`ディレクトリ内の内容が正しく生成されない場合、通常これが原因です.実際の状況に応じて True または False に調整してください.
|
120 |
+
|
121 |
+
2. ボリューム設定: コンテナ内のアプリケーションのルートディレクトリは`/workspace`に設定されます.デフォルトの`docker-compose.yaml`には、アップロード/ダウンロードの内容の実例がいくつか記載されています.
|
122 |
+
3. `shm_size`: Windows の Docker Desktop のデフォルトの利用可能メモリは小さすぎるため、うまく動作しない可能性があります.状況に応じて適宜設定してください.
|
123 |
+
4. `deploy`セクションの GPU に関連する内容は、システムと実際の状況に応じて慎重に設定してください.
|
124 |
+
|
125 |
+
#### docker compose で実行する
|
126 |
+
|
127 |
+
```bash
|
128 |
+
docker compose -f "docker-compose.yaml" up -d
|
129 |
+
```
|
130 |
+
|
131 |
+
#### docker コマンドで実行する
|
132 |
+
|
133 |
+
上記と同様に、実際の状況に基づいて対応するパラメータを変更し、次のコマンドを実行します:
|
134 |
+
|
135 |
+
```bash
|
136 |
+
docker run --rm -it --gpus=all --env=is_half=False --volume=G:\GPT-SoVITS-DockerTest\output:/workspace/output --volume=G:\GPT-SoVITS-DockerTest\logs:/workspace/logs --volume=G:\GPT-SoVITS-DockerTest\SoVITS_weights:/workspace/SoVITS_weights --workdir=/workspace -p 9880:9880 -p 9871:9871 -p 9872:9872 -p 9873:9873 -p 9874:9874 --shm-size="16G" -d breakstring/gpt-sovits:xxxxx
|
137 |
+
```
|
138 |
+
|
139 |
+
## 事前訓練済みモデル
|
140 |
+
|
141 |
+
**`install.sh`が正常に実行された場合、No.1,2,3 はスキップしてかまいません.**
|
142 |
+
|
143 |
+
1. [GPT-SoVITS Models](https://huggingface.co/lj1995/GPT-SoVITS) から事前訓練済みモデルをダウンロードし、`GPT_SoVITS/pretrained_models` ディレクトリに配置してください.
|
144 |
+
|
145 |
+
2. [G2PWModel.zip(HF)](https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/G2PWModel.zip)| [G2PWModel.zip(ModelScope)](https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/G2PWModel.zip) からモデルをダウンロードし、解凍して `G2PWModel` にリネームし、`GPT_SoVITS/text` ディレクトリに配置してください. (中国語 TTS のみ)
|
146 |
+
|
147 |
+
3. UVR5 (ボーカル/伴奏 (BGM 等) 分離 & リバーブ除去の追加機能) の場合は、[UVR5 Weights](https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main/uvr5_weights) からモデルをダウンロードし、`tools/uvr5/uvr5_weights` ディレクトリに配置してください.
|
148 |
+
|
149 |
+
- UVR5 で bs_roformer または mel_band_roformer モデルを使用する場合、モデルと対応する設定ファイルを手動でダウンロードし、`tools/UVR5/UVR5_weights`フォルダに配置することができます.**モデルファイルと設定ファイルの名前は、拡張子を除いて同じであることを確認してください**.さらに、モデルと設定ファイルの名前には**「roformer」が含まれている必要があります**.これにより、roformer クラスのモデルとして認識されます.
|
150 |
+
|
151 |
+
- モデル名と設定ファイル名には、**直接モデルタイプを指定することをお勧めします**.例: mel_band_roformer、bs_roformer.指定しない場合、設定ファイルから特徴を照合して、モデルの種類を特定します.例えば、モデル`bs_roformer_ep_368_sdr_12.9628.ckpt`と対応する設定ファイル`bs_roformer_ep_368_sdr_12.9628.yaml`はペアです.同様に、`kim_mel_band_roformer.ckpt`と`kim_mel_band_roformer.yaml`もペアです.
|
152 |
+
|
153 |
+
4. 中国語 ASR (追加機能) の場合は、[Damo ASR Model](https://modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/files)、[Damo VAD Model](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/files)、および [Damo Punc Model](https://modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/files) からモデルをダウンロードし、`tools/asr/models` ディレクトリに配置してください.
|
154 |
+
|
155 |
+
5. 英語または日本語の ASR (追加機能) を使用する場合は、[Faster Whisper Large V3](https://huggingface.co/Systran/faster-whisper-large-v3) からモデルをダウンロードし、`tools/asr/models` ディレクトリに配置してください.また、[他のモデル](https://huggingface.co/Systran) は、より小さいサイズで高クオリティな可能性があります.
|
156 |
+
|
157 |
+
## データセット形式
|
158 |
+
|
159 |
+
TTS アノテーション .list ファイル形式:
|
160 |
+
|
161 |
+
```
|
162 |
+
vocal_path|speaker_name|language|text
|
163 |
+
```
|
164 |
+
|
165 |
+
言語辞書:
|
166 |
+
|
167 |
+
- 'zh': 中国語
|
168 |
+
- 'ja': 日本語
|
169 |
+
- 'en': 英語
|
170 |
+
|
171 |
+
例:
|
172 |
+
|
173 |
+
```
|
174 |
+
D:\GPT-SoVITS\xxx/xxx.wav|xxx|en|I like playing Genshin.
|
175 |
+
```
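As a complement to the annotation format above, this is a hedged Python sketch (illustrative only, not part of the project) that writes such `.list` lines programmatically; the output path and speaker name are placeholders.

```python
# Illustrative writer for the annotation format above:
# vocal_path|speaker_name|language|text
from pathlib import Path


def write_list_file(list_path: str, rows) -> None:
    """rows: iterable of (vocal_path, speaker_name, language, text) tuples."""
    lines = [f"{vocal}|{speaker}|{lang}|{text}" for vocal, speaker, lang, text in rows]
    Path(list_path).write_text("\n".join(lines) + "\n", encoding="utf-8")


if __name__ == "__main__":
    write_list_file(
        "output/demo.list",  # placeholder path
        [("slicer_opt/sample_0001.wav", "xxx", "en", "I like playing Genshin.")],
    )
```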
|
176 |
+
|
177 |
+
## 微調整と推論
|
178 |
+
|
179 |
+
### WebUI を開く
|
180 |
+
|
181 |
+
#### 統合パッケージ利用者
|
182 |
+
|
183 |
+
`go-webui.bat`をダブルクリックするか、`go-webui.ps1`を使用します.
|
184 |
+
V1 に切り替えたい場合は、`go-webui-v1.bat`をダブルクリックするか、`go-webui-v1.ps1`を使用してください.
|
185 |
+
|
186 |
+
#### その他
|
187 |
+
|
188 |
+
```bash
|
189 |
+
python webui.py <言語(オプション)>
|
190 |
+
```
|
191 |
+
|
192 |
+
V1 に切り替えたい場合は
|
193 |
+
|
194 |
+
```bash
|
195 |
+
python webui.py v1 <言語(オプション)>
|
196 |
+
```
|
197 |
+
|
198 |
+
または WebUI で手動でバージョンを切り替えてください.
|
199 |
+
|
200 |
+
### 微調整
|
201 |
+
|
202 |
+
#### パス自動補完のサポート
|
203 |
+
|
204 |
+
1. 音声パスを入力する
|
205 |
+
2. 音声を小さなチャンクに分割する
|
206 |
+
3. ノイズ除去 (オプション)
|
207 |
+
4. ASR
|
208 |
+
5. ASR転写を校正する
|
209 |
+
6. 次のタブに移動し、モデルを微調整する
|
210 |
+
|
211 |
+
### 推論 WebUI を開く
|
212 |
+
|
213 |
+
#### 統合パッケージ利用者
|
214 |
+
|
215 |
+
`go-webui-v2.bat`をダブルクリックするか、`go-webui-v2.ps1`を使用して、`1-GPT-SoVITS-TTS/1C-inference`で推論 webui を開きます.
|
216 |
+
|
217 |
+
#### その他
|
218 |
+
|
219 |
+
```bash
|
220 |
+
python GPT_SoVITS/inference_webui.py <言語(オプション)>
|
221 |
+
```
|
222 |
+
|
223 |
+
または
|
224 |
+
|
225 |
+
```bash
|
226 |
+
python webui.py
|
227 |
+
```
|
228 |
+
|
229 |
+
その後、`1-GPT-SoVITS-TTS/1C-inference`で推論 webui を開きます.
|
230 |
+
|
231 |
+
## V2 リリースノート
|
232 |
+
|
233 |
+
新機能:
|
234 |
+
|
235 |
+
1. 韓国語と広東語をサポート
|
236 |
+
|
237 |
+
2. 最適化されたテキストフロントエンド
|
238 |
+
|
239 |
+
3. 事前学習済みモデルが 2 千時間から 5 千時間に拡張
|
240 |
+
|
241 |
+
4. 低品質の参照音声に対する合成品質の向上
|
242 |
+
|
243 |
+
[詳細はこちら](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v2%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
|
244 |
+
|
245 |
+
V1 環境から V2 を使用するには:
|
246 |
+
|
247 |
+
1. `pip install -r requirements.txt`を使用していくつかのパッケージを更新
|
248 |
+
|
249 |
+
2. 最新のコードを github からクローン
|
250 |
+
|
251 |
+
3. [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main/gsv-v2final-pretrained)から V2 の事前学習モデルをダウンロードし、それらを`GPT_SoVITS\pretrained_models\gsv-v2final-pretrained`に配置
|
252 |
+
|
253 |
+
中国語 V2 追加: [G2PWModel.zip(HF)](https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/G2PWModel.zip)| [G2PWModel.zip(ModelScope)](https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/G2PWModel.zip) (G2PW モデルをダウンロードし、解凍して`G2PWModel`にリネームし、`GPT_SoVITS/text`に配置します)
|
254 |
+
|
255 |
+
## V3 リリースノート
|
256 |
+
|
257 |
+
新機能:
|
258 |
+
|
259 |
+
1. 音色の類似性が向上し、ターゲットスピーカーを近似するために必要な学習データが少なくなりました (音色の類似性は、ファインチューニングなしでベースモデルを直接使用することで顕著に改善されます).
|
260 |
+
|
261 |
+
2. GPT モデルがより安定し、繰り返しや省略が減少し、より豊かな感情表現を持つ音声の生成が容易になりました.
|
262 |
+
|
263 |
+
[詳細情報はこちら](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
|
264 |
+
|
265 |
+
v2 環境から v3 を使用する方法:
|
266 |
+
|
267 |
+
1. `pip install -r requirements.txt` を実行して、いくつかのパッケージを更新します.
|
268 |
+
|
269 |
+
2. GitHub から最新のコードをクローンします.
|
270 |
+
|
271 |
+
3. v3 の事前学習済みモデル (s1v3.ckpt、s2Gv3.pth、models--nvidia--bigvgan_v2_24khz_100band_256x フォルダ) を[Huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main) からダウンロードし、GPT_SoVITS\pretrained_models フォルダに配置します.
|
272 |
+
|
273 |
+
追加: 音声超解像モデルについては、[ダウンロード方法](../../tools/AP_BWE_main/24kto48k/readme.txt)を参照してください.
|
274 |
+
|
275 |
+
## Todo リスト
|
276 |
+
|
277 |
+
- [x] **優先度 高:**
|
278 |
+
|
279 |
+
- [x] 日本語と英語でのローカライズ.
|
280 |
+
- [x] ユーザーガイド.
|
281 |
+
- [x] 日本語データセットと英語データセットのファインチューニングトレーニング.
|
282 |
+
|
283 |
+
- [ ] **機能:**
|
284 |
+
- [x] ゼロショット音声変換 (5 秒) /数ショット音声変換 (1 分).
|
285 |
+
- [x] TTS スピーキングスピードコントロール.
|
286 |
+
- [ ] ~~TTS の感情コントロールの強化.~~
|
287 |
+
- [ ] SoVITS トークン入力を語彙の確率分布に変更する実験.
|
288 |
+
- [x] 英語と日本語のテキストフロントエンドを改善.
|
289 |
+
- [ ] 小型と大型の TTS モデルを開発する.
|
290 |
+
- [x] Colab のスクリプト.
|
291 |
+
- [ ] トレーニングデータセットを拡張する (2k→10k).
|
292 |
+
- [x] より良い sovits ベースモデル (音質向上)
|
293 |
+
- [ ] モデルミックス
|
294 |
+
|
295 |
+
## (追加の) コマンドラインから実行する方法
|
296 |
+
|
297 |
+
コマンド ラインを使用して UVR5 の WebUI を開きます
|
298 |
+
|
299 |
+
```
|
300 |
+
python tools/uvr5/webui.py "<infer_device>" <is_half> <webui_port_uvr5>
|
301 |
+
```
|
302 |
+
|
303 |
+
<!-- ブラウザを開けない場合は、以下の形式に従って UVR 処理を行ってください.これはオーディオ処理に mdxnet を使用しています.
|
304 |
+
```
|
305 |
+
python mdxnet.py --model --input_root --output_vocal --output_ins --agg_level --format --device --is_half_precision
|
306 |
+
``` -->
|
307 |
+
|
308 |
+
コマンド ラインを使用してデータセットのオーディオ セグメンテーションを行う方法は次のとおりです.
|
309 |
+
|
310 |
+
```
|
311 |
+
python audio_slicer.py \
|
312 |
+
--input_path "<path_to_original_audio_file_or_directory>" \
|
313 |
+
--output_root "<directory_where_subdivided_audio_clips_will_be_saved>" \
|
314 |
+
--threshold <volume_threshold> \
|
315 |
+
--min_length <minimum_duration_of_each_subclip> \
|
316 |
+
    --min_interval <shortest_time_gap_between_adjacent_subclips> \
|
317 |
+
--hop_size <step_size_for_computing_volume_curve>
|
318 |
+
```
|
319 |
+
|
320 |
+
コマンドラインを使用してデータセット ASR 処理を行う方法です (中国語のみ)
|
321 |
+
|
322 |
+
```
|
323 |
+
python tools/asr/funasr_asr.py -i <input> -o <output>
|
324 |
+
```
|
325 |
+
|
326 |
+
ASR 処理は Faster_Whisper を通じて実行されます(中国語を除く ASR マーキング)
|
327 |
+
|
328 |
+
(進行状況バーは表示されません.GPU のパフォーマンスにより時間遅延が発生する可能性があります)
|
329 |
+
|
330 |
+
```
|
331 |
+
python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p <precision>
|
332 |
+
```
|
333 |
+
|
334 |
+
カスタムリストの保存パスが有効になっています
|
335 |
+
|
336 |
+
## クレジット
|
337 |
+
|
338 |
+
特に以下のプロジェクトと貢献者に感謝します:
|
339 |
+
|
340 |
+
### 理論研究
|
341 |
+
|
342 |
+
- [ar-vits](https://github.com/innnky/ar-vits)
|
343 |
+
- [SoundStorm](https://github.com/yangdongchao/SoundStorm/tree/master/soundstorm/s1/AR)
|
344 |
+
- [vits](https://github.com/jaywalnut310/vits)
|
345 |
+
- [TransferTTS](https://github.com/hcy71o/TransferTTS/blob/master/models.py#L556)
|
346 |
+
- [contentvec](https://github.com/auspicious3000/contentvec/)
|
347 |
+
- [hifi-gan](https://github.com/jik876/hifi-gan)
|
348 |
+
- [fish-speech](https://github.com/fishaudio/fish-speech/blob/main/tools/llama/generate.py#L41)
|
349 |
+
- [f5-TTS](https://github.com/SWivid/F5-TTS/blob/main/src/f5_tts/model/backbones/dit.py)
|
350 |
+
- [shortcut flow matching](https://github.com/kvfrans/shortcut-models/blob/main/targets_shortcut.py)
|
351 |
+
|
352 |
+
### 事前学習モデル
|
353 |
+
|
354 |
+
- [Chinese Speech Pretrain](https://github.com/TencentGameMate/chinese_speech_pretrain)
|
355 |
+
- [Chinese-Roberta-WWM-Ext-Large](https://huggingface.co/hfl/chinese-roberta-wwm-ext-large)
|
356 |
+
- [BigVGAN](https://github.com/NVIDIA/BigVGAN)
|
357 |
+
|
358 |
+
### 推論用テキストフロントエンド
|
359 |
+
|
360 |
+
- [paddlespeech zh_normalization](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/zh_normalization)
|
361 |
+
- [split-lang](https://github.com/DoodleBears/split-lang)
|
362 |
+
- [g2pW](https://github.com/GitYCC/g2pW)
|
363 |
+
- [pypinyin-g2pW](https://github.com/mozillazg/pypinyin-g2pW)
|
364 |
+
- [paddlespeech g2pw](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/g2pw)
|
365 |
+
|
366 |
+
### WebUI ツール
|
367 |
+
|
368 |
+
- [ultimatevocalremovergui](https://github.com/Anjok07/ultimatevocalremovergui)
|
369 |
+
- [audio-slicer](https://github.com/openvpi/audio-slicer)
|
370 |
+
- [SubFix](https://github.com/cronrpc/SubFix)
|
371 |
+
- [FFmpeg](https://github.com/FFmpeg/FFmpeg)
|
372 |
+
- [gradio](https://github.com/gradio-app/gradio)
|
373 |
+
- [faster-whisper](https://github.com/SYSTRAN/faster-whisper)
|
374 |
+
- [FunASR](https://github.com/alibaba-damo-academy/FunASR)
|
375 |
+
- [AP-BWE](https://github.com/yxlu-0102/AP-BWE)
|
376 |
+
|
377 |
+
@Naozumi520 さん、広東語のトレーニングセットの提供と、広東語に関する知識のご指導をいただき、感謝申し上げます.
|
378 |
+
|
379 |
+
## すべてのコントリビューターに感謝します
|
380 |
+
|
381 |
+
<a href="https://github.com/RVC-Boss/GPT-SoVITS/graphs/contributors" target="_blank">
|
382 |
+
<img src="https://contrib.rocks/image?repo=RVC-Boss/GPT-SoVITS" />
|
383 |
+
</a>
|
docs/ko/Changelog_KO.md
ADDED
@@ -0,0 +1,222 @@
1 |
+
### 20240121 업데이트
|
2 |
+
|
3 |
+
1. `config`에 `is_share`를 추가했습니다. Colab과 같은 시나리오에서는 이 값을 `True`로 설정하여 WebUI를 공개 네트워크에 매핑할 수 있습니다.
|
4 |
+
2. WebUI에 영어 시스템 번역 지원을 추가했습니다.
|
5 |
+
3. `cmd-asr`이 FunASR 모델이 포함되어 있는지 자동으로 감지합니다; 기본 디렉토리에서 찾을 수 없으면 ModelScope에서 다운로드됩니다.
|
6 |
+
4. [Issue 79](https://github.com/RVC-Boss/GPT-SoVITS/issues/79)에서 보고된 SoVITS 훈련의 ZeroDivisionError를 필터링 샘플 등으로 해결하려고 시도했습니다.
|
7 |
+
5. `TEMP` 폴더의 캐시된 오디오 파일 및 기타 파일을 정리했습니다.
|
8 |
+
6. 참조 오디오의 끝이 포함된 합성 오디오 문제를 크게 줄였습니다.
|
9 |
+
|
10 |
+
### 20240122 업데이트
|
11 |
+
|
12 |
+
1. 지나치게 짧은 출력 파일로 인해 참조 오디오가 반복되는 문제를 수정했습니다.
|
13 |
+
2. 영어 및 일본어 훈련의 네이티브 지원을 테스트했습니다 (일본어 훈련 시 루트 디렉토리에 비영어 특수 문자가 없어야 합니다).
|
14 |
+
3. 오디오 경로 확인을 개선했습니다. 잘못된 입력 경로에서 읽으려는 시도가 있을 경우, ffmpeg 오류 대신 경로가 존재하지 않는다고 보고합니다.
|
15 |
+
|
16 |
+
### 20240123 업데이트
|
17 |
+
|
18 |
+
1. Hubert 추출로 인해 NaN 오류가 발생하여 SoVITS/GPT 훈련에서 ZeroDivisionError가 발생하는 문제를 해결했습니다.
|
19 |
+
2. 추론 WebUI에서 빠른 모델 전환 지원을 추가했습니다.
|
20 |
+
3. 모델 파일 정렬 로직을 최적화했습니다.
|
21 |
+
4. 중국어 단어 분할을 위해 `jieba`를 `jieba_fast`로 교체했습니다.
|
22 |
+
|
23 |
+
### 20240126 업데이트
|
24 |
+
|
25 |
+
1. 중국어-영어 혼합 및 일본어-영어 혼합 출력 텍스트를 지원합니다.
|
26 |
+
2. 출력에 대한 선택적 분할 모드를 추가했습니다.
|
27 |
+
3. UVR5 읽기 문제 및 디렉토리 자동 탈출 문제를 수정했습니다.
|
28 |
+
4. 추론 오류를 일으키는 여러 줄 바꿈 문제를 수정했습니다.
|
29 |
+
5. 추론 WebUI 에서 중복 로그를 제거했습니다.
|
30 |
+
6. Mac에서 훈련 및 추론을 지원합니다.
|
31 |
+
7. 절반 정밀도를 지원하지 않는 GPU에 대해 자동으로 단정밀도를 강제하며, CPU 추론 시 단정밀도를 적용합니다.
|
32 |
+
|
33 |
+
### 20240128 업데이트
|
34 |
+
|
35 |
+
1. 숫자의 발음이 중국어 문자로 변환되는 문제를 수정했습니다.
|
36 |
+
2. 문장 시작 부분에서 몇 개의 문자가 누락되는 문제를 수정했습니다.
|
37 |
+
3. 제한을 설정하여 비합리적인 길이의 참조 오디오를 제외했습니다.
|
38 |
+
4. GPT 훈련 시 체크포인트가 저장되지 않는 문제를 수정했습니다.
|
39 |
+
5. Dockerfile 에서 모델 다운로드 프로세스를 완료했습니다.
|
40 |
+
|
41 |
+
### 20240129 업데이트
|
42 |
+
|
43 |
+
1. 절반 정밀도 훈련에 문제가 있는 16 시리즈와 같은 GPU의 훈련 구성을 단정밀도로 변경했습니다.
|
44 |
+
2. 사용 가능한 Colab 버전을 테스트하고 업데이트했습니다.
|
45 |
+
3. 이전 버전의 FunASR 로 인해 인터페이스 정렬 오류가 발생하는 ModelScope FunASR 저장소의 git 클로닝 문제를 수정했습니다.
|
46 |
+
|
47 |
+
### 20240130 업데이트
|
48 |
+
|
49 |
+
1. 모든 경로 관련 항목에서 이중 따옴표를 자동으로 제거하여 초보자가 이중 따옴표가 포함된 경로를 복사하는 오류를 방지했습니다.
|
50 |
+
2. 중국어 및 영어 문장 부호 분할 문제를 수정하고 문장 시작과 끝에 부호를 추가했습니다.
|
51 |
+
3. 부호에 의한 분할을 추가했습니다.
|
52 |
+
|
53 |
+
### 20240201 업데이트
|
54 |
+
|
55 |
+
1. 분리 실패를 일으킨 UVR5 형식 읽기 오류를 수정했습니다.
|
56 |
+
2. 혼합된 중국어-일본어-영어 텍스트에 대한 자동 분할 및 언어 인식을 지원합니다.
|
57 |
+
|
58 |
+
### 20240202 업데이트
|
59 |
+
|
60 |
+
1. `/` 로 끝나는 ASR 경로가 파일 이름 저장 시 오류를 발생시키는 문제를 수정했습니다.
|
61 |
+
2. [PR 377](https://github.com/RVC-Boss/GPT-SoVITS/pull/377) 에서는 PaddleSpeech 의 Normalizer 를 도입하여 "xx.xx%" (백분율 기호)와 "元/吨"이 "元吨"으로 읽히는 문제를 "元每吨"으로 수정하고, 밑줄 오류를 수정했습니다.
|
62 |
+
|
63 |
+
### 20240207 업데이트
|
64 |
+
|
65 |
+
1. [Issue 391](https://github.com/RVC-Boss/GPT-SoVITS/issues/391) 에서 보고된 중국어 추론 품질 저하를 일으킨 언어 매개변수 혼동을 수정했습니다.
|
66 |
+
2. [PR 403](https://github.com/RVC-Boss/GPT-SoVITS/pull/403) 에서는 UVR5 를 높은 버전의 librosa에 맞게 조정했습니다.
|
67 |
+
3. [Commit 14a2851](https://github.com/RVC-Boss/GPT-SoVITS/commit/14a285109a521679f8846589c22da8f656a46ad8)에서는 `is_half` 매개변수가 불리언으로 변환되지 않아 발생한 UVR5 `inf` 오류를 수정했습니다. 이로 인해 16 시리즈 GPU에서 `inf` 가 발생했습니다.
|
68 |
+
4. 영어 텍스트 프론트엔드를 최적화했습니다.
|
69 |
+
5. Gradio 종속성 문제를 수정했습니다.
|
70 |
+
6. 데이터셋 준비 시 루트 디렉토리를 비워두면 `.list` 전체 경로를 자동으로 읽도록 지원합니다.
|
71 |
+
7. 일본어와 영어에 대한 Faster Whisper ASR을 통합했습니다.
|
72 |
+
|
73 |
+
### 20240208 업데이트
|
74 |
+
|
75 |
+
1. [Commit 59f35ad](https://github.com/RVC-Boss/GPT-SoVITS/commit/59f35adad85815df27e9c6b33d420f5ebfd8376b)에서는 Windows 10 1909와 [Issue 232](https://github.com/RVC-Boss/GPT-SoVITS/issues/232) (전통 중국어 시스템 언어)에서 GPT 훈련 멈춤 문제를 수정하려고 했습니다.
|
76 |
+
|
77 |
+
### 20240212 업데이트
|
78 |
+
|
79 |
+
1. Faster Whisper와 FunASR의 로직을 최적화하고, Faster Whisper를 미러 다운로드로 전환하여 Hugging Face 연결 문제를 피했습니다.
|
80 |
+
2. [PR 457](https://github.com/RVC-Boss/GPT-SoVITS/pull/457)은 DPO Loss 실험적 훈련 옵션을 활성화하여 GPT의 반복 및 문자 누락 문제를 완화하고, 훈련 중 부정 샘플을 구성하며 여러 추론 매개변수를 추론 WebUI에서 사용할 수 있게 했습니다.
|
81 |
+
|
82 |
+
### 20240214 업데이트
|
83 |
+
|
84 |
+
1. 훈련 시 중국어 실험 이름을 지원합니다 (이전에는 오류가 발생했습니다).
|
85 |
+
2. DPO 훈련을 필수 기능 대신 선택적 기능으로 변경했습니다. 선택 시, 배치 크기가 자동으로 절반으로 줄어듭니다. 추론 WebUI에서 새로운 매개변수가 전달되지 않는 문제를 수정했습니다.
|
86 |
+
|
87 |
+
### 20240216 업데이트
|
88 |
+
|
89 |
+
1. 참조 텍스트 없이 입력을 지원합니다.
|
90 |
+
2. [Issue 475](https://github.com/RVC-Boss/GPT-SoVITS/issues/475)에서 보고된 중국어 프론트엔드의 버그를 수정했습니다.
|
91 |
+
|
92 |
+
### 20240221 업데이트
|
93 |
+
|
94 |
+
1. 데이터 처리 중 노이즈 감소 옵션을 추가했습니다 (노이즈 감소는 16kHz 샘플링 비율만 남깁니다; 배경 노이즈가 심한 경우에만 사용하십시오).
|
95 |
+
2. [PR 559](https://github.com/RVC-Boss/GPT-SoVITS/pull/559), [PR 556](https://github.com/RVC-Boss/GPT-SoVITS/pull/556), [PR 532](https://github.com/RVC-Boss/GPT-SoVITS/pull/532), [PR 507](https://github.com/RVC-Boss/GPT-SoVITS/pull/507), [PR 509](https://github.com/RVC-Boss/GPT-SoVITS/pull/509) 중국어 및 일본어 프론트엔드 처리를 최적화했습니다.
|
96 |
+
3. Mac CPU 추론을 MPS 대신 CPU를 사용하도록 전환하여 성능을 향상시켰습니다.
|
97 |
+
4. Colab 공개 URL 문제를 수정했습니다.
|
98 |
+
|
99 |
+
### 20240306 업데이트
|
100 |
+
|
101 |
+
1. [PR 672](https://github.com/RVC-Boss/GPT-SoVITS/pull/672)는 추론 속도를 50% 가속화했습니다 (RTX3090 + PyTorch 2.2.1 + CU11.8 + Win10 + Py39에서 테스트됨).
|
102 |
+
2. Faster Whisper의 비중국어 ASR을 사용할 때 중국어 FunASR 모델을 먼저 다운로드할 필요가 없습니다.
|
103 |
+
3. [PR 610](https://github.com/RVC-Boss/GPT-SoVITS/pull/610)은 UVR5 리버브 제거 모델에서 설정이 반대로 되어 있는 문제를 수정했습니다.
|
104 |
+
4. [PR 675](https://github.com/RVC-Boss/GPT-SoVITS/pull/675)는 CUDA가 없는 경우 Faster Whisper의 자동 CPU 추론을 가능하게 했습니다.
|
105 |
+
5. [PR 573](https://github.com/RVC-Boss/GPT-SoVITS/pull/573)은 Mac에서 올바른 CPU 추론을 보장하기 위해 `is_half` 체크를 수정했습니다.
|
106 |
+
|
107 |
+
### 202403/202404/202405 업데이트
|
108 |
+
|
109 |
+
#### 사소한 수정:
|
110 |
+
|
111 |
+
1. 참조 텍스트 없는 모드의 문제를 수정했습니다.
|
112 |
+
2. 중국어 및 영어 텍스트 프론트엔드를 최적화했습니다.
|
113 |
+
3. API 형식을 개선했습니다.
|
114 |
+
4. CMD 형식 문제를 수정했습니다.
|
115 |
+
5. 훈련 데이터 처리 중 지원되지 않는 언어에 대한 오류 프롬프트를 추가했습니다.
|
116 |
+
6. Hubert 추출의 버그를 수정했습니다.
|
117 |
+
|
118 |
+
#### 주요 수정:
|
119 |
+
|
120 |
+
1. VQ를 고정하지 않은 채 SoVITS를 훈련하던 문제(품질 저하를 일으킬 수 있음)를 수정했습니다.
|
121 |
+
2. 빠른 추론 분기를 추가했습니다.
|
122 |
+
|
123 |
+
### 20240610 업데이트
|
124 |
+
|
125 |
+
#### 사소한 수정:
|
126 |
+
|
127 |
+
1. [PR 1168](https://github.com/RVC-Boss/GPT-SoVITS/pull/1168) & [PR 1169](https://github.com/RVC-Boss/GPT-SoVITS/pull/1169) 순수 구두점 및 다중 구두점 텍스트 입력 로직을 개선했습니다.
|
128 |
+
2. [Commit 501a74a](https://github.com/RVC-Boss/GPT-SoVITS/commit/501a74ae96789a26b48932babed5eb4e9483a232) UVR5에서 MDXNet 디러버브를 위한 CMD 형식을 수정하고 공백이 있는 경로를 지원했습니다.
|
129 |
+
3. [PR 1159](https://github.com/RVC-Boss/GPT-SoVITS/pull/1159) `s2_train.py`에서 SoVITS 훈련을 위한 진행률 표시줄 로직을 수정했습니다.
|
130 |
+
|
131 |
+
#### 주요 수정:
|
132 |
+
|
133 |
+
4. [Commit 99f09c8](https://github.com/RVC-Boss/GPT-SoVITS/commit/99f09c8bdc155c1f4272b511940717705509582a) WebUI의 GPT 미세 조정이 중국어 입력 텍스트의 BERT 기능을 읽지 않아 추론과 불일치 및 잠재적 품질 저하를 일으키는 문제를 수정했습니다.
|
134 |
+
**주의: 이전에 많은 양의 데이터로 미세 조정한 경우 품질을 향상시키기 위해 모델을 다시 조정하는 것이 좋습니다.**
|
135 |
+
|
136 |
+
### 20240706 업데이트
|
137 |
+
|
138 |
+
#### 사소한 수정:
|
139 |
+
|
140 |
+
1. [Commit 1250670](https://github.com/RVC-Boss/GPT-SoVITS/commit/db50670598f0236613eefa6f2d5a23a271d82041) CPU 추론에서 기본 배치 크기 소수점 문제를 수정했습니다.
|
141 |
+
2. [PR 1258](https://github.com/RVC-Boss/GPT-SoVITS/pull/1258), [PR 1265](https://github.com/RVC-Boss/GPT-SoVITS/pull/1265), [PR 1267](https://github.com/RVC-Boss/GPT-SoVITS/pull/1267) 노이즈 제거 또는 ASR이 예외를 만나면 모든 보류 중인 오디오 파일이 종료되는 문제를 수정했습니다.
|
142 |
+
3. [PR 1253](https://github.com/RVC-Boss/GPT-SoVITS/pull/1253) 구두점으로 분할할 때 소수점 분할 문제를 수정했습니다.
|
143 |
+
4. [Commit a208698](https://github.com/RVC-Boss/GPT-SoVITS/commit/a208698e775155efc95b187b746d153d0f2847ca) 다중 GPU 훈련을 위한 다중 프로세스 저장 로직을 수정했습니다.
|
144 |
+
5. [PR 1251](https://github.com/RVC-Boss/GPT-SoVITS/pull/1251) 불필요한 `my_utils`를 제거했습니다.
|
145 |
+
|
146 |
+
#### 주요 수정:
|
147 |
+
|
148 |
+
6. [PR 672](https://github.com/RVC-Boss/GPT-SoVITS/pull/672)의 가속 추론 코드가 검증되어 메인 브랜치에 병합되었으며, 기본과 일관된 추론 효과를 보장합니다.
|
149 |
+
또한 참조 텍스트 없는 모드에서 가속 추론을 지원합니다.
|
150 |
+
|
151 |
+
**향후 업데이트에서는 `fast_inference` 브랜치의 변경 사항의 일관성을 계속 검증할 것입니다**.
|
152 |
+
|
153 |
+
### 20240727 업데이트
|
154 |
+
|
155 |
+
#### 사소한 수정:
|
156 |
+
|
157 |
+
1. [PR 1298](https://github.com/RVC-Boss/GPT-SoVITS/pull/1298) 불필요한 i18n 코드를 정리했습니다.
|
158 |
+
2. [PR 1299](https://github.com/RVC-Boss/GPT-SoVITS/pull/1299) 사용자 파일 경로의 후행 슬래시가 명령줄 오류를 일으키는 문제를 수정했습니다.
|
159 |
+
3. [PR 756](https://github.com/RVC-Boss/GPT-SoVITS/pull/756) GPT 훈련의 단계 계산 로직을 수정했습니다.
|
160 |
+
|
161 |
+
#### 주요 수정:
|
162 |
+
|
163 |
+
4. [Commit 9588a3c](https://github.com/RVC-Boss/GPT-SoVITS/commit/9588a3c52d9ebdb20b3c5d74f647d12e7c1171c2) 합성을 위한 음성 속도 조절을 지원했습니다.
|
164 |
+
음성 속도만 조절하면서 무작위성을 고정할 수 있습니다.
|
165 |
+
|
166 |
+
### 20240806 업데이트
|
167 |
+
|
168 |
+
1. [PR 1306](https://github.com/RVC-Boss/GPT-SoVITS/pull/1306), [PR 1356](https://github.com/RVC-Boss/GPT-SoVITS/pull/1356) BS RoFormer 보컬 반주 분리 모델에 대한 지원을 추가했습니다. [Commit e62e965](https://github.com/RVC-Boss/GPT-SoVITS/commit/e62e965323a60a76a025bcaa45268c1ddcbcf05c) FP16 추론을 활성화했습니다.
|
169 |
+
2. 중국어 텍스트 프론트엔드를 개선했습니다.
|
170 |
+
- [PR 488](https://github.com/RVC-Boss/GPT-SoVITS/pull/488) 중국어 다의자 지원 (v2 전용);
|
171 |
+
- [PR 987](https://github.com/RVC-Boss/GPT-SoVITS/pull/987) 양사(수량사) 추가;
|
172 |
+
- [PR 1351](https://github.com/RVC-Boss/GPT-SoVITS/pull/1351) 사칙연산 및 기본 수학 공식을 지원합니다;
|
173 |
+
- [PR 1404](https://github.com/RVC-Boss/GPT-SoVITS/pull/1404) 혼합 텍스트 오류를 수정했습니다.
|
174 |
+
3. [PR 1355](https://github.com/RVC-Boss/GPT-SoVITS/pull/1356) WebUI 에서 오디오를 처리할 때 경로를 자동으로 채웠습니다.
|
175 |
+
4. [Commit bce451a](https://github.com/RVC-Boss/GPT-SoVITS/commit/bce451a2d1641e581e200297d01f219aeaaf7299), [Commit 4c8b761](https://github.com/RVC-Boss/GPT-SoVITS/commit/4c8b7612206536b8b4435997acb69b25d93acb78) GPU 인식 로직을 최적화했습니다.
|
176 |
+
5. [Commit 8a10147](https://github.com/RVC-Boss/GPT-SoVITS/commit/8a101474b5a4f913b4c94fca2e3ca87d0771bae3) 광동어 ASR 지원을 추가했습니다.
|
177 |
+
6. GPT-SoVITS v2 지원을 추가했습니다.
|
178 |
+
7. [PR 1387](https://github.com/RVC-Boss/GPT-SoVITS/pull/1387) 타이밍 로직을 최적화했습니다.
|
179 |
+
|
180 |
+
### 20240821 업데이트
|
181 |
+
|
182 |
+
1. [PR 1490](https://github.com/RVC-Boss/GPT-SoVITS/pull/1490) `fast_inference` 브랜치를 메인 브랜치에 병합.
|
183 |
+
2. [Issue 1508](https://github.com/RVC-Boss/GPT-SoVITS/issues/1508) SSML 태그를 사용하여 숫자, 전화번호, 날짜 및 시간 최적화 지원.
|
184 |
+
3. [PR 1503](https://github.com/RVC-Boss/GPT-SoVITS/pull/1503) API 수정 및 최적화.
|
185 |
+
4. [PR 1422](https://github.com/RVC-Boss/GPT-SoVITS/pull/1422) 믹싱을 위한 참조 오디오를 하나만 업로드할 수 있는 버그 수정, 다양한 데이터셋 검사 추가 및 파일이 누락된 경우 경고 팝업.
|
186 |
+
|
187 |
+
### 20250211 업데이트
|
188 |
+
|
189 |
+
- [Wiki](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)) GPT-SoVITS v3 모델 추가, SoVITS v3의 파인튜닝에는 14GB GPU 메모리가 필요합니다.
|
190 |
+
|
191 |
+
### 20250212 업데이트
|
192 |
+
|
193 |
+
- [PR 2040](https://github.com/RVC-Boss/GPT-SoVITS/pull/2040) SoVITS v3의 파인튜닝에 그라디언트 체크포인트 추가, 12GB GPU 메모리가 필요합니다.
|
194 |
+
|
195 |
+
### 20250214 업데이트
|
196 |
+
|
197 |
+
- [PR 2047](https://github.com/RVC-Boss/GPT-SoVITS/pull/2047) 다국어 혼합 텍스트 분할 전략 **A** 최적화.
|
198 |
+
- `split-lang`을 언어 분할 도구로 추가하여 다국어 혼합 텍스트의 분할 능력을 향상시켰습니다.
|
199 |
+
|
200 |
+
### 20250217 업데이트
|
201 |
+
|
202 |
+
- [PR 2062](https://github.com/RVC-Boss/GPT-SoVITS/pull/2062) 텍스트 내 숫자와 영어 처리 로직 최적화.
|
203 |
+
|
204 |
+
### 20250218 업데이트
|
205 |
+
|
206 |
+
- [PR 2073](https://github.com/RVC-Boss/GPT-SoVITS/pull/2073) 다국어 혼합 텍스트 분할 전략 **B** 최적화.
|
207 |
+
|
208 |
+
### 20250223 업데이트
|
209 |
+
|
210 |
+
1. SoVITS V3의 파인튜닝에 LoRA 훈련이 지원됩니다. 8GB GPU 메모리가 필요하며, 전체 매개변수 파인튜닝보다 더 나은 결과를 제공합니다.
|
211 |
+
2. [PR 2078](https://github.com/RVC-Boss/GPT-SoVITS/pull/2078) 보컬 및 악기 분리를 위해 Mel Band RoFormer 모델 추가.
|
212 |
+
|
213 |
+
### 20250226 업데이트
|
214 |
+
|
215 |
+
1. [PR 2112](https://github.com/RVC-Boss/GPT-SoVITS/pull/2112) Windows에서 비영어 디렉토리로 인한 문제 수정.
|
216 |
+
- 한국어에 대한 `langsegmenter` 사용 문제 수정.
|
217 |
+
2. [PR 2113](https://github.com/RVC-Boss/GPT-SoVITS/pull/2114) Windows에서 비영어 디렉토리로 인한 문제 수정.
|
218 |
+
- 한국어/일본어에 대한 `langsegmenter` 사용 문제 수정.
|
219 |
+
|
220 |
+
### 20250227 업데이트
|
221 |
+
|
222 |
+
- V3 모델로 24K 오디오를 생성할 때 발생하는 음성 뭉침 문제를 완화하기 위해, 24K에서 48K로의 오디오 초해상도 모델을 추가했습니다. [Issue 2085](https://github.com/RVC-Boss/GPT-SoVITS/issues/2085), [Issue 2117](https://github.com/RVC-Boss/GPT-SoVITS/issues/2117)에서 보고된 문제입니다.
|
docs/ko/README.md
ADDED
@@ -0,0 +1,389 @@
1 |
+
<div align="center">
|
2 |
+
|
3 |
+
<h1>GPT-SoVITS-WebUI</h1>
|
4 |
+
소량의 데이터로 음성 변환 및 음성 합성을 지원하는 강력한 WebUI.<br><br>
|
5 |
+
|
6 |
+
[](https://github.com/RVC-Boss/GPT-SoVITS)
|
7 |
+
|
8 |
+
<img src="https://counter.seku.su/cmoe?name=gptsovits&theme=r34" /><br>
|
9 |
+
|
10 |
+
[](https://colab.research.google.com/github/RVC-Boss/GPT-SoVITS/blob/main/colab_webui.ipynb)
|
11 |
+
[](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/LICENSE)
|
12 |
+
[](https://huggingface.co/spaces/lj1995/GPT-SoVITS-v2)
|
13 |
+
[](https://discord.gg/dnrgs5GHfG)
|
14 |
+
|
15 |
+
[**English**](../../README.md) | [**中文简体**](../cn/README.md) | [**日本語**](../ja/README.md) | **한국어** | [**Türkçe**](../tr/README.md)
|
16 |
+
|
17 |
+
</div>
|
18 |
+
|
19 |
+
---
|
20 |
+
|
21 |
+
## 기능:
|
22 |
+
|
23 |
+
1. **제로샷 텍스트 음성 변환 (TTS):** 5초의 음성 샘플을 입력하면 즉시 텍스트를 음성으로 변환할 수 있습니다.
|
24 |
+
|
25 |
+
2. **소량의 데이터 TTS:** 1분의 훈련 데이터만으로 모델을 미세 조정하여 음성 유사도와 실제감을 향상시킬 수 있습니다.
|
26 |
+
|
27 |
+
3. **다국어 지원:** 훈련 데이터셋과 다른 언어의 추론을 지원하며, 현재 영어, 일본어, 중국어, 광둥어, 한국어를 지원합니다.
|
28 |
+
|
29 |
+
4. **WebUI 도구:** 음성 반주 분리, 자동 훈련 데이터셋 분할, 중국어 자동 음성 인식(ASR) 및 텍스트 주석 등의 도구를 통합하여 초보자가 훈련 데이터셋과 GPT/SoVITS 모델을 생성하는 데 도움을 줍니다.
|
30 |
+
|
31 |
+
**데모 비디오를 확인하세요! [demo video](https://www.bilibili.com/video/BV12g4y1m7Uw)**
|
32 |
+
|
33 |
+
보지 못한 발화자의 퓨샷(few-shot) 파인튜닝 데모:
|
34 |
+
|
35 |
+
https://github.com/RVC-Boss/GPT-SoVITS/assets/129054828/05bee1fa-bdd8-4d85-9350-80c060ab47fb
|
36 |
+
|
37 |
+
**사용자 설명서: [简体中文](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e) | [English](https://rentry.co/GPT-SoVITS-guide#/)**
|
38 |
+
|
39 |
+
## 설치
|
40 |
+
|
41 |
+
### 테스트 통과 환경
|
42 |
+
|
43 |
+
| Python Version | PyTorch Version | Device |
|
44 |
+
|----------------|------------------|-----------------|
|
45 |
+
| Python 3.9 | PyTorch 2.0.1 | CUDA 11.8 |
|
46 |
+
| Python 3.10.13 | PyTorch 2.1.2 | CUDA 12.3 |
|
47 |
+
| Python 3.10.17 | PyTorch 2.5.1 | CUDA 12.4 |
|
48 |
+
| Python 3.9 | PyTorch 2.5.1 | Apple silicon |
|
49 |
+
| Python 3.11 | PyTorch 2.6.0 | Apple silicon |
|
50 |
+
| Python 3.9 | PyTorch 2.2.2 | CPU |
|
51 |
+
|
52 |
+
### Windows
|
53 |
+
|
54 |
+
Windows 사용자라면 (win>=10에서 테스트됨), [통합 패키지를 다운로드](https://huggingface.co/lj1995/GPT-SoVITS-windows-package/resolve/main/GPT-SoVITS-v3lora-20250228.7z?download=true)한 후 압축을 풀고 _go-webui.bat_ 파일을 더블 클릭하면 GPT-SoVITS-WebUI를 시작할 수 있습니다.
|
55 |
+
|
56 |
+
### Linux
|
57 |
+
|
58 |
+
```bash
|
59 |
+
conda create -n GPTSoVits python=3.9
|
60 |
+
conda activate GPTSoVits
|
61 |
+
bash install.sh --source <HF|HF-Mirror|ModelScope> [--download-uvr5]
|
62 |
+
```
|
63 |
+
|
64 |
+
### macOS
|
65 |
+
|
66 |
+
**주의: Mac에서 GPU로 훈련된 모델은 다른 OS에서 훈련된 모델에 비해 품질이 낮습니다. 해당 문제를 해결하기 전까지 MacOS에선 CPU를 사용하여 훈련을 진행합니다.**
|
67 |
+
|
68 |
+
1. `xcode-select --install`을 실행하여 Xcode 커맨드라인 도구를 설치하세요.
|
69 |
+
2. 다음 명령어를 실행하여 이 프로젝트를 설치하세요.
|
70 |
+
|
71 |
+
```bash
|
72 |
+
conda create -n GPTSoVits python=3.9
|
73 |
+
conda activate GPTSoVits
|
74 |
+
bash install.sh --source <HF|HF-Mirror|ModelScope> [--download-uvr5]
|
75 |
+
```
|
76 |
+
|
77 |
+
### 수동 설치
|
78 |
+
|
79 |
+
#### FFmpeg 설치
|
80 |
+
|
81 |
+
##### Conda 사용자
|
82 |
+
|
83 |
+
```bash
|
84 |
+
conda install ffmpeg
|
85 |
+
```
|
86 |
+
|
87 |
+
##### Ubuntu/Debian 사용자
|
88 |
+
|
89 |
+
```bash
|
90 |
+
sudo apt install ffmpeg
|
91 |
+
sudo apt install libsox-dev
|
92 |
+
conda install -c conda-forge 'ffmpeg<7'
|
93 |
+
```
|
94 |
+
|
95 |
+
##### Windows 사용자
|
96 |
+
|
97 |
+
[ffmpeg.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffmpeg.exe)와 [ffprobe.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffprobe.exe)를 GPT-SoVITS root 디렉토리에 넣습니다.
|
98 |
+
|
99 |
+
[Visual Studio 2017](https://aka.ms/vs/17/release/vc_redist.x86.exe) 설치 (Korean TTS 전용)
|
100 |
+
|
101 |
+
##### MacOS 사용자
|
102 |
+
|
103 |
+
```bash
|
104 |
+
brew install ffmpeg
|
105 |
+
```
|
106 |
+
|
107 |
+
#### 의존성 설치
|
108 |
+
|
109 |
+
```bash
|
110 |
+
pip install -r extra-req.txt --no-deps
|
111 |
+
pip install -r requirements.txt
|
112 |
+
```
|
113 |
+
|
114 |
+
### Docker에서 사용
|
115 |
+
|
116 |
+
#### docker-compose.yaml 설정
|
117 |
+
|
118 |
+
0. 이미지 태그: 코드 저장소가 빠르게 업데이트되고 패키지가 느리게 빌드되고 테스트되므로, 현재 빌드된 최신 도커 이미지를 [Docker Hub](https://hub.docker.com/r/breakstring/gpt-sovits)(오래된 버전) 에서 확인하고 필요에 따라 Dockerfile을 사용하여 로컬에서 빌드할 수 있습니다.
|
119 |
+
|
120 |
+
1. 환경 변수:
|
121 |
+
|
122 |
+
- is_half: 반정밀/배정밀 제어. "SSL 추출" 단계에서 4-cnhubert/5-wav32k 디렉토리의 내용을 올바르게 생성할 수 없는 경우, 일반적으로 이것 때문입니다. 실제 상황에 따라 True 또는 False로 조정할 수 있습니다.
|
123 |
+
|
124 |
+
2. 볼륨 설정, 컨테이너 내의 애플리케이션 루트 디렉토리를 /workspace로 설정합니다. 기본 docker-compose.yaml에는 실제 예제가 나열되어 있으므로 업로드/다운로드를 쉽게 할 수 있습니다.
|
125 |
+
|
126 |
+
3. shm_size: Windows의 Docker Desktop의 기본 사용 가능한 메모리가 너무 작아 오류가 발생할 수 있으므로 실제 상황에 따라 조정합니다.
|
127 |
+
|
128 |
+
4. deploy 섹션의 gpu 관련 내용은 시스템 및 실제 상황에 따라 조정합니다.
|
129 |
+
|
130 |
+
#### docker compose로 실행
|
131 |
+
|
132 |
+
```
|
133 |
+
docker compose -f "docker-compose.yaml" up -d
|
134 |
+
```
|
135 |
+
|
136 |
+
#### docker 명령으로 실행
|
137 |
+
|
138 |
+
위와 동일하게 실제 상황에 맞게 매개변수를 수정한 다음 다음 명령을 실행합니다:
|
139 |
+
|
140 |
+
```
|
141 |
+
docker run --rm -it --gpus=all --env=is_half=False --volume=G:\GPT-SoVITS-DockerTest\output:/workspace/output --volume=G:\GPT-SoVITS-DockerTest\logs:/workspace/logs --volume=G:\GPT-SoVITS-DockerTest\SoVITS_weights:/workspace/SoVITS_weights --workdir=/workspace -p 9880:9880 -p 9871:9871 -p 9872:9872 -p 9873:9873 -p 9874:9874 --shm-size="16G" -d breakstring/gpt-sovits:xxxxx
|
142 |
+
```
|
143 |
+
|
144 |
+
## 사전 학습된 모델
|
145 |
+
|
146 |
+
**`install.sh`가 성공적으로 실행되면 No.1,2,3 은 건너뛰어도 됩니다.**
|
147 |
+
|
148 |
+
1. [GPT-SoVITS Models](https://huggingface.co/lj1995/GPT-SoVITS) 에서 사전 학습된 모델을 다운로드하고, `GPT_SoVITS/pretrained_models` 디렉토리에 배치하세요.
|
149 |
+
|
150 |
+
2. [G2PWModel.zip(HF)](https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/G2PWModel.zip)| [G2PWModel.zip(ModelScope)](https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/G2PWModel.zip) 에서 모델을 다운로드하고 압축을 풀어 `G2PWModel`로 이름을 변경한 후, `GPT_SoVITS/text` 디렉토리에 배치하세요. (중국어 TTS 전용)
|
151 |
+
|
152 |
+
3. UVR5 (보컬/반주 분리 & 잔향 제거 추가 기능)의 경우, [UVR5 Weights](https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main/uvr5_weights) 에서 모델을 다운로드하고 `tools/uvr5/uvr5_weights` 디렉토리에 배치하세요.
|
153 |
+
|
154 |
+
- UVR5에서 bs_roformer 또는 mel_band_roformer 모델을 사용할 경우, 모델과 해당 설정 파일을 수동으로 다운로드하여 `tools/UVR5/UVR5_weights` 폴더에 저장할 수 있습니다. **모델 파일과 설정 파일의 이름은 확장자를 제외하고 동일한 이름을 가지도록 해야 합니다**. 또한, 모델과 설정 파일 이름에는 **"roformer"**가 포함되어야 roformer 클래스의 모델로 인식됩니다.
|
155 |
+
|
156 |
+
- 모델 이름과 설정 파일 이름에 **모델 유형을 직접 지정하는 것이 좋습니다**. 예: mel_mand_roformer, bs_roformer. 지정하지 않으면 설정 파일을 기준으로 특성을 비교하여 어떤 유형의 모델인지를 판단합니다. 예를 들어, 모델 `bs_roformer_ep_368_sdr_12.9628.ckpt`와 해당 설정 파일 `bs_roformer_ep_368_sdr_12.9628.yaml`은 한 쌍입니다. `kim_mel_band_roformer.ckpt`와 `kim_mel_band_roformer.yaml`도 한 쌍입니다.
|
157 |
+
|
158 |
+
4. 중국어 ASR (추가 기능)의 경우, [Damo ASR Model](https://modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/files), [Damo VAD Model](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/files) 및 [Damo Punc Model](https://modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/files) 에서 모델을 다운로드하고, `tools/asr/models` 디렉토리에 배치하세요.
|
159 |
+
|
160 |
+
5. 영어 또는 일본어 ASR (추가 기능)의 경우, [Faster Whisper Large V3](https://huggingface.co/Systran/faster-whisper-large-v3) 에서 모델을 다운로드하고, `tools/asr/models` 디렉토리에 배치하세요. 또한, [다른 모델](https://huggingface.co/Systran) 은 더 적은 디스크 용량으로 비슷한 효과를 가질 수 있습니다.
|
161 |
+
|
162 |
+
## 데이터셋 형식
|
163 |
+
|
164 |
+
텍스트 음성 합성(TTS) 주석 .list 파일 형식:
|
165 |
+
|
166 |
+
```
|
167 |
+
vocal_path|speaker_name|language|text
|
168 |
+
```
|
169 |
+
|
170 |
+
언어 사전:
|
171 |
+
|
172 |
+
- 'zh': 중국어
|
173 |
+
- 'ja': 일본어
|
174 |
+
- 'en': 영어
|
175 |
+
|
176 |
+
예시:
|
177 |
+
|
178 |
+
```
|
179 |
+
D:\GPT-SoVITS\xxx/xxx.wav|xxx|en|I like playing Genshin.
|
180 |
+
```
|
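The pipe-separated annotation format above can be sanity-checked before training with a few lines of shell. The sketch below is illustrative only and not part of the repository; `train.list` is a placeholder file name, and the accepted language codes should be adjusted to the list shown above (v2 additionally accepts `ko` and `yue`).

```bash
# Illustrative sketch only: basic sanity checks for a .list annotation file.
# "train.list" is a placeholder path; adjust the language codes to your setup.
while IFS='|' read -r vocal_path speaker_name language text; do
    [ -f "$vocal_path" ] || echo "missing audio: $vocal_path"
    case "$language" in
        zh|ja|en|ko|yue) ;;  # codes documented above
        *) echo "unexpected language code '$language' for $vocal_path" ;;
    esac
done < train.list
```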
181 |
+
|
182 |
+
## 미세 조정 및 추론
|
183 |
+
|
184 |
+
### WebUI 열기
|
185 |
+
|
186 |
+
#### 통합 패키지 사용자
|
187 |
+
|
188 |
+
`go-webui.bat`을 더블 클릭하거나 `go-webui.ps1`를 사용하십시오.
|
189 |
+
V1으로 전환하려면, `go-webui-v1.bat`을 더블 클릭하거나 `go-webui-v1.ps1`를 사용하십시오.
|
190 |
+
|
191 |
+
#### 기타
|
192 |
+
|
193 |
+
```bash
|
194 |
+
python webui.py <언어(옵션)>
|
195 |
+
```
|
196 |
+
|
197 |
+
V1으로 전환하려면,
|
198 |
+
|
199 |
+
```bash
|
200 |
+
python webui.py v1 <언어(옵션)>
|
201 |
+
```
|
202 |
+
|
203 |
+
또는 WebUI에서 수동으로 버전을 전환하십시오.
|
204 |
+
|
205 |
+
### 미세 조정
|
206 |
+
|
207 |
+
#### 경로 자동 채우기가 지원됩니다
|
208 |
+
|
209 |
+
1. 오디오 경로를 입력하십시오.
|
210 |
+
2. 오디오를 작은 청크로 분할하십시오.
|
211 |
+
3. 노이즈 제거(옵션)
|
212 |
+
4. ASR 수행
|
213 |
+
5. ASR 전사를 교정하십시오.
|
214 |
+
6. 다음 탭으로 이동하여 모델을 미세 조정하십시오.
|
215 |
+
|
216 |
+
### 추론 WebUI 열기
|
217 |
+
|
218 |
+
#### 통합 패키지 사용자
|
219 |
+
|
220 |
+
`go-webui-v2.bat`을 더블 클릭하거나 `go-webui-v2.ps1`를 사용한 다음 `1-GPT-SoVITS-TTS/1C-inference`에서 추론 webui를 엽니다.
|
221 |
+
|
222 |
+
#### 기타
|
223 |
+
|
224 |
+
```bash
|
225 |
+
python GPT_SoVITS/inference_webui.py <언어(옵션)>
|
226 |
+
```
|
227 |
+
|
228 |
+
또는
|
229 |
+
|
230 |
+
```bash
|
231 |
+
python webui.py
|
232 |
+
```
|
233 |
+
|
234 |
+
그런 다음 `1-GPT-SoVITS-TTS/1C-inference`에서 추론 webui를 엽니다.
|
235 |
+
|
236 |
+
## V2 릴리스 노트
|
237 |
+
|
238 |
+
새로운 기능:
|
239 |
+
|
240 |
+
1. 한국어 및 광둥어 지원
|
241 |
+
|
242 |
+
2. 최적화된 텍스트 프론트엔드
|
243 |
+
|
244 |
+
3. 사전 학습 모델이 2천 시간에서 5천 시간으로 확장
|
245 |
+
|
246 |
+
4. 저품질 참조 오디오에 대한 합성 품질 향상
|
247 |
+
|
248 |
+
[자세한 내용](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v2%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
|
249 |
+
|
250 |
+
V1 환경에서 V2를 사용하려면:
|
251 |
+
|
252 |
+
1. `pip install -r requirements.txt`를 사용하여 일부 패키지 업데이트
|
253 |
+
|
254 |
+
2. github에서 최신 코드를 클론하십시오.
|
255 |
+
|
256 |
+
3. [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main/gsv-v2final-pretrained)에서 V2 사전 학습 모델을 다운로드하여 `GPT_SoVITS\pretrained_models\gsv-v2final-pretrained`에 넣으십시오.
|
257 |
+
|
258 |
+
중국어 V2 추가: [G2PWModel.zip(HF)](https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/G2PWModel.zip)| [G2PWModel.zip(ModelScope)](https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/G2PWModel.zip) (G2PW 모델을 다운로드하여 압축을 풀고 `G2PWModel`로 이름을 변경한 다음 `GPT_SoVITS/text`에 배치합니다.)
|
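Put together, the upgrade steps above look roughly like the sketch below. It assumes an existing git checkout and that the `huggingface-cli` tool from `huggingface_hub` is installed; downloading the weights this way is only one option, and fetching them manually from the huggingface page linked above works just as well.

```bash
# Rough sketch of the V1 -> V2 upgrade steps described above.
git pull                           # step 2: update to the latest code
pip install -r requirements.txt    # step 1: update the packages
# step 3: fetch the V2 pretrained weights into GPT_SoVITS/pretrained_models/gsv-v2final-pretrained
huggingface-cli download lj1995/GPT-SoVITS \
    --include "gsv-v2final-pretrained/*" \
    --local-dir GPT_SoVITS/pretrained_models
```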
259 |
+
|
260 |
+
## V3 릴리스 노트
|
261 |
+
|
262 |
+
새로운 기능:
|
263 |
+
|
264 |
+
1. 음색 유사성이 더 높아져 목표 음성에 대한 학습 데이터가 적게 필요합니다. (기본 모델을 직접 사용하여 미세 조정 없이 음색 유사성이 크게 향상됩니다.)
|
265 |
+
|
266 |
+
2. GPT 모델이 더 안정적이며 반복 및 생략이 적고, 더 풍부한 감정 표현을 가진 음성을 생성하기가 더 쉽습니다.
|
267 |
+
|
268 |
+
[자세한 내용](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
|
269 |
+
|
270 |
+
v2 환경에서 v3 사용하기:
|
271 |
+
|
272 |
+
1. `pip install -r requirements.txt`로 일부 패키지를 업데이트합니다.
|
273 |
+
|
274 |
+
2. 최신 코드를 github 에서 클론합니다.
|
275 |
+
|
276 |
+
3. v3 사전 훈련된 모델(s1v3.ckpt, s2Gv3.pth, 그리고 models--nvidia--bigvgan_v2_24khz_100band_256x 폴더)을 [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main)에서 다운로드하여 `GPT_SoVITS\pretrained_models` 폴더에 넣습니다.
|
277 |
+
|
278 |
+
추가: 오디오 슈퍼 해상도 모델에 대해서는 [다운로드 방법](../../tools/AP_BWE_main/24kto48k/readme.txt)을 참고하세요.
|
279 |
+
|
280 |
+
## 할 일 목록
|
281 |
+
|
282 |
+
- [x] **최우선순위:**
|
283 |
+
|
284 |
+
- [x] 일본어 및 영어 지역화.
|
285 |
+
- [x] 사용자 가이드.
|
286 |
+
- [x] 일본어 및 영어 데이터셋 미세 조정 훈련.
|
287 |
+
|
288 |
+
- [ ] **기능:**
|
289 |
+
|
290 |
+
- [x] 제로샷 음성 변환 (5초) / 소량의 음성 변환 (1분).
|
291 |
+
- [x] TTS 속도 제어.
|
292 |
+
- [ ] ~~향상된 TTS 감정 제어.~~
|
293 |
+
- [ ] SoVITS 토큰 입력을 단어 확률 분포로 변경해 보세요.
|
294 |
+
- [x] 영어 및 일본어 텍스트 프론트 엔드 개선.
|
295 |
+
- [ ] 작은 크기와 큰 크기의 TTS 모델 개발.
|
296 |
+
- [x] Colab 스크립트.
|
297 |
+
- [ ] 훈련 데이터셋 확장 (2k 시간에서 10k 시간).
|
298 |
+
- [x] 더 나은 sovits 기본 모델 (향상된 오디오 품질).
|
299 |
+
- [ ] 모델 블렌딩.
|
300 |
+
|
301 |
+
## (추가적인) 명령줄에서 실행하는 방법
|
302 |
+
|
303 |
+
명령줄을 사용하여 UVR5용 WebUI 열기
|
304 |
+
|
305 |
+
```
|
306 |
+
python tools/uvr5/webui.py "<infer_device>" <is_half> <webui_port_uvr5>
|
307 |
+
```
|
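For example, with illustrative values only (the device string, half-precision flag, and port are whatever your own setup expects; 9873 is simply the UVR5 port mapped in the Docker example above):

```
python tools/uvr5/webui.py "cuda" True 9873
```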
308 |
+
|
309 |
+
<!-- 브라우저를 열 수 없는 경우 UVR 처리를 위해 아래 형식을 따르십시오. 이는 오디오 처리를 위해 mdxnet을 사용하는 것입니다.
|
310 |
+
```
|
311 |
+
python mdxnet.py --model --input_root --output_vocal --output_ins --agg_level --format --device --is_half_precision
|
312 |
+
``` -->
|
313 |
+
|
314 |
+
명령줄을 사용하여 데이터세트의 오디오 분할을 수행하는 방법은 다음과 같습니다.
|
315 |
+
|
316 |
+
```
|
317 |
+
python audio_slicer.py \
|
318 |
+
--input_path "<path_to_original_audio_file_or_directory>" \
|
319 |
+
--output_root "<directory_where_subdivided_audio_clips_will_be_saved>" \
|
320 |
+
--threshold <volume_threshold> \
|
321 |
+
--min_length <minimum_duration_of_each_subclip> \
|
322 |
+
--min_interval <shortest_time_gap_between_adjacent_subclips> \
|
323 |
+
--hop_size <step_size_for_computing_volume_curve>
|
324 |
+
```
|
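A filled-in version of the same command might look like this; every value below is only an example and should be tuned for your own recordings (quieter sources usually need a lower threshold):

```
python audio_slicer.py \
    --input_path "output/uvr5_opt" \
    --output_root "output/slicer_opt" \
    --threshold -34 \
    --min_length 4000 \
    --min_interval 300 \
    --hop_size 10
```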
325 |
+
|
326 |
+
명령줄을 사용하여 데이터 세트 ASR 처리를 수행하는 방법입니다(중국어만 해당).
|
327 |
+
|
328 |
+
```
|
329 |
+
python tools/asr/funasr_asr.py -i <input> -o <output>
|
330 |
+
```
|
331 |
+
|
332 |
+
ASR 처리는 Faster_Whisper(중국어를 제외한 ASR 마킹)를 통해 수행됩니다.
|
333 |
+
|
334 |
+
(진행률 표시줄 없음, GPU 성능으로 인해 시간 지연이 발생할 수 있음)
|
335 |
+
|
336 |
+
```
|
337 |
+
python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p <precision>
|
338 |
+
```
|
339 |
+
|
340 |
+
사용자 정의 목록 저장 경로가 활성화되었습니다.
|
341 |
+
|
342 |
+
## 감사의 말
|
343 |
+
|
344 |
+
다음 프로젝트와 기여자들에게 특별히 감사드립니다:
|
345 |
+
|
346 |
+
### 이론 연구
|
347 |
+
|
348 |
+
- [ar-vits](https://github.com/innnky/ar-vits)
|
349 |
+
- [SoundStorm](https://github.com/yangdongchao/SoundStorm/tree/master/soundstorm/s1/AR)
|
350 |
+
- [vits](https://github.com/jaywalnut310/vits)
|
351 |
+
- [TransferTTS](https://github.com/hcy71o/TransferTTS/blob/master/models.py#L556)
|
352 |
+
- [contentvec](https://github.com/auspicious3000/contentvec/)
|
353 |
+
- [hifi-gan](https://github.com/jik876/hifi-gan)
|
354 |
+
- [fish-speech](https://github.com/fishaudio/fish-speech/blob/main/tools/llama/generate.py#L41)
|
355 |
+
- [f5-TTS](https://github.com/SWivid/F5-TTS/blob/main/src/f5_tts/model/backbones/dit.py)
|
356 |
+
- [shortcut flow matching](https://github.com/kvfrans/shortcut-models/blob/main/targets_shortcut.py)
|
357 |
+
|
358 |
+
### 사전 학습 모델
|
359 |
+
|
360 |
+
- [Chinese Speech Pretrain](https://github.com/TencentGameMate/chinese_speech_pretrain)
|
361 |
+
- [Chinese-Roberta-WWM-Ext-Large](https://huggingface.co/hfl/chinese-roberta-wwm-ext-large)
|
362 |
+
- [BigVGAN](https://github.com/NVIDIA/BigVGAN)
|
363 |
+
|
364 |
+
### 추론용 텍스트 프론트엔드
|
365 |
+
|
366 |
+
- [paddlespeech zh_normalization](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/zh_normalization)
|
367 |
+
- [split-lang](https://github.com/DoodleBears/split-lang)
|
368 |
+
- [g2pW](https://github.com/GitYCC/g2pW)
|
369 |
+
- [pypinyin-g2pW](https://github.com/mozillazg/pypinyin-g2pW)
|
370 |
+
- [paddlespeech g2pw](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/g2pw)
|
371 |
+
|
372 |
+
### WebUI 도구
|
373 |
+
|
374 |
+
- [ultimatevocalremovergui](https://github.com/Anjok07/ultimatevocalremovergui)
|
375 |
+
- [audio-slicer](https://github.com/openvpi/audio-slicer)
|
376 |
+
- [SubFix](https://github.com/cronrpc/SubFix)
|
377 |
+
- [FFmpeg](https://github.com/FFmpeg/FFmpeg)
|
378 |
+
- [gradio](https://github.com/gradio-app/gradio)
|
379 |
+
- [faster-whisper](https://github.com/SYSTRAN/faster-whisper)
|
380 |
+
- [FunASR](https://github.com/alibaba-damo-academy/FunASR)
|
381 |
+
- [AP-BWE](https://github.com/yxlu-0102/AP-BWE)
|
382 |
+
|
383 |
+
@Naozumi520 님께 광둥어 학습 자료를 제공해 주시고 광둥어 관련 지식을 지도해 주신 것에 깊이 감사드립니다.
|
384 |
+
|
385 |
+
## 모든 기여자들에게 감사드립니다 ;)
|
386 |
+
|
387 |
+
<a href="https://github.com/RVC-Boss/GPT-SoVITS/graphs/contributors" target="_blank">
|
388 |
+
<img src="https://contrib.rocks/image?repo=RVC-Boss/GPT-SoVITS" />
|
389 |
+
</a>
|
docs/tr/Changelog_TR.md
ADDED
@@ -0,0 +1,222 @@
1 |
+
### 20240121 Güncellemesi
|
2 |
+
|
3 |
+
1. `config`e `is_share` eklendi. Colab gibi senaryolarda, WebUI'yi halka açık ağa yönlendirmek için bu değeri `True` olarak ayarlayabilirsiniz.
|
4 |
+
2. WebUI'ye İngilizce sistem çeviri desteği eklendi.
|
5 |
+
3. `cmd-asr`, FunASR modelinin dahil olup olmadığını otomatik olarak tespit eder; eğer varsayılan dizinde bulunamazsa, ModelScope'dan indirilecektir.
|
6 |
+
4. [Issue 79](https://github.com/RVC-Boss/GPT-SoVITS/issues/79)de bildirilen SoVITS eğitimindeki ZeroDivisionError'u sıfır uzunlukta örnekleri filtreleyerek düzeltmeye çalıştık.
|
7 |
+
5. `TEMP` klasöründeki önbelleğe alınmış ses dosyaları ve diğer dosyaları temizledik.
|
8 |
+
6. Referans sesinin sonunu içeren sentezlenmiş ses sorununu önemli ölçüde azalttık.
|
9 |
+
|
10 |
+
### 20240122 Güncellemesi
|
11 |
+
|
12 |
+
1. Aşırı kısa çıktı dosyalarının referans sesini tekrarlamasına neden olan sorun giderildi.
|
13 |
+
2. İngilizce ve Japonca eğitim için yerel destek test edildi (Japonca eğitim için kök dizinin İngilizce olmayan özel karakterlerden arındırılmış olması gerekir).
|
14 |
+
3. Ses yolu denetimi iyileştirildi. Yanlış bir giriş yolundan okumaya çalışıldığında, ffmpeg hatası yerine yolun mevcut olmadığını bildirir.
|
15 |
+
|
16 |
+
### 20240123 Güncellemesi
|
17 |
+
|
18 |
+
1. Hubert çıkarımının NaN hatalarına neden olup SoVITS/GPT eğitiminde ZeroDivisionError'a yol açtığı sorun çözüldü.
|
19 |
+
2. İnferans WebUI'de hızlı model değiştirme desteği eklendi.
|
20 |
+
3. Model dosyası sıralama mantığı optimize edildi.
|
21 |
+
4. Çince kelime ayrımı için `jieba` `jieba_fast` ile değiştirildi.
|
22 |
+
|
23 |
+
### 20240126 Güncellemesi
|
24 |
+
|
25 |
+
1. Çince-İngilizce ve Japonca-İngilizce karışık çıktı metinleri için destek eklendi.
|
26 |
+
2. Çıktı için isteğe bağlı bir bölme modu eklendi.
|
27 |
+
3. UVR5'in dizinlerden otomatik olarak çıkmasına neden olan okuma sorununu düzelttik.
|
28 |
+
4. Çeşitli yeni satır sorunlarını düzelterek çıkarım hatalarını giderdik.
|
29 |
+
5. Çıkarım WebUI'deki gereksiz günlükleri kaldırdık.
|
30 |
+
6. Mac'te eğitim ve çıkarım desteği eklendi.
|
31 |
+
7. Yarım hassasiyeti desteklemeyen GPU'lar için otomatik olarak tek hassasiyet zorlandı; CPU çıkarımında tek hassasiyet uygulandı.
|
32 |
+
|
33 |
+
### 20240128 Güncellemesi
|
34 |
+
|
35 |
+
1. Sayıların Çince karakterlere dönüştürülmesiyle ilgili sorunu düzelttik.
|
36 |
+
2. Cümlelerin başındaki birkaç karakterin yutulması sorununu düzelttik.
|
37 |
+
3. Mantıksız referans ses uzunluklarını sınırlamalar koyarak hariç tuttuk.
|
38 |
+
4. GPT eğitiminin kontrol noktalarını kaydetmemesi sorununu düzelttik.
|
39 |
+
5. Dockerfile'da model indirme sürecini tamamladık.
|
40 |
+
|
41 |
+
### 20240129 Güncellemesi
|
42 |
+
|
43 |
+
1. Yarım hassasiyet eğitimi ile ilgili sorun yaşayan 16 serisi gibi GPU'lar için eğitim yapılandırmalarını tek hassasiyete değiştirdik.
|
44 |
+
2. Mevcut Colab sürümünü test ettik ve güncelledik.
|
45 |
+
3. Eski sürüm FunASR ile ModelScope FunASR deposunun git klonlanmasıyla oluşan arayüz hizalama hatalarını düzelttik.
|
46 |
+
|
47 |
+
### 20240130 Güncellemesi
|
48 |
+
|
49 |
+
1. Çift tırnaklarla yol kopyalama hatalarını önlemek için tüm yol ile ilgili girdilerden otomatik olarak çift tırnakları kaldırdık.
|
50 |
+
2. Çince ve İngilizce noktalama işaretlerini ayırma sorunlarını düzelttik ve cümlelerin başına ve sonuna noktalama işaretleri ekledik.
|
51 |
+
3. Noktalama işaretlerine göre ayırma özelliğini ekledik.
|
52 |
+
|
53 |
+
### 20240201 Güncellemesi
|
54 |
+
|
55 |
+
1. Ayrılma hatalarına neden olan UVR5 format okuma hatasını düzelttik.
|
56 |
+
2. Karışık Çince-Japonca-İngilizce metinler için otomatik segmentasyon ve dil tanıma desteği sağladık.
|
57 |
+
|
58 |
+
### 20240202 Güncellemesi
|
59 |
+
|
60 |
+
1. `/` ile biten bir ASR yolunun dosya adını kaydetme hatasına neden olma sorununu düzelttik.
|
61 |
+
2. [PR 377](https://github.com/RVC-Boss/GPT-SoVITS/pull/377) PaddleSpeech'in Normalizer'ını tanıtarak "xx.xx%" (yüzde sembolleri) ve "元/吨" ifadesinin "元吨" yerine "元每吨" olarak okunması gibi sorunları düzelttik ve alt çizgi hatalarını giderdik.
|
62 |
+
|
63 |
+
### 20240207 Güncellemesi
|
64 |
+
|
65 |
+
1. [Issue 391](https://github.com/RVC-Boss/GPT-SoVITS/issues/391)de bildirilen dil parametresi karışıklığının Çince çıkarım kalitesini düşürme sorununu düzelttik.
|
66 |
+
2. [PR 403](https://github.com/RVC-Boss/GPT-SoVITS/pull/403) ile UVR5'i daha yüksek versiyonlarda librosa'ya uyarladık.
|
67 |
+
3. [Commit 14a2851](https://github.com/RVC-Boss/GPT-SoVITS/commit/14a285109a521679f8846589c22da8f656a46ad8) `is_half` parametresinin booleana dönüştürülmemesi nedeniyle sürekli yarım hassasiyet çıkarımı yaparak 16 serisi GPU'larda `inf` hatasına neden olan UVR5 inf hatasını düzelttik.
|
68 |
+
4. İngilizce metin önyüzünü optimize ettik.
|
69 |
+
5. Gradio bağımlılıklarını düzelttik.
|
70 |
+
6. Veri seti hazırlığı sırasında kök dizini boş bırakıldığında `.list` tam yollarının otomatik olarak okunmasını destekledik.
|
71 |
+
7. Japonca ve İngilizce için Faster Whisper ASR'yi entegre ettik.
|
72 |
+
|
73 |
+
### 20240208 Güncellemesi
|
74 |
+
|
75 |
+
1. [Commit 59f35ad](https://github.com/RVC-Boss/GPT-SoVITS/commit/59f35adad85815df27e9c6b33d420f5ebfd8376b) ile Windows 10 1909'da ve [Issue 232](https://github.com/RVC-Boss/GPT-SoVITS/issues/232)de (Geleneksel Çince Sistem Dili) bildirilen GPT eğitim durma sorununu düzeltmeye çalıştık.
|
76 |
+
|
77 |
+
### 20240212 Güncellemesi
|
78 |
+
|
79 |
+
1. Faster Whisper ve FunASR için mantığı optimize ettik, Hugging Face bağlantı sorunlarını önlemek için Faster Whisper'ı ayna indirmelere yönlendirdik.
|
80 |
+
2. [PR 457](https://github.com/RVC-Boss/GPT-SoVITS/pull/457) GPT tekrarı ve eksik karakterleri azaltmak için eğitim sırasında negatif örnekler oluşturarak deneysel DPO Loss eğitim seçeneğini etkinleştirdi ve çıkarım WebUI'de çeşitli çıkarım parametrelerini kullanılabilir hale getirdi.
|
81 |
+
|
82 |
+
### 20240214 Güncellemesi
|
83 |
+
|
84 |
+
1. Eğitimde Çince deney adlarını destekledik (önceden hatalara neden oluyordu).
|
85 |
+
2. DPO eğitimini zorunlu yerine isteğe bağlı bir özellik yaptık. Seçilirse, parti boyutu otomatik olarak yarıya indirilir. Çıkarım WebUI'de yeni parametrelerin iletilmemesi sorunlarını düzelttik.
|
86 |
+
|
87 |
+
### 20240216 Güncellemesi
|
88 |
+
|
89 |
+
1. Referans metin olmadan girişi destekledik.
|
90 |
+
2. [Issue 475](https://github.com/RVC-Boss/GPT-SoVITS/issues/475) de bildirilen Çince önyüz hatalarını düzelttik.
|
91 |
+
|
92 |
+
### 20240221 Güncellemesi
|
93 |
+
|
94 |
+
1. Veri işleme sırasında bir gürültü azaltma seçeneği ekledik (gürültü azaltma sadece 16kHz örnekleme hızını bırakır; yalnızca arka plan gürültüsü önemliyse kullanın).
|
95 |
+
2. [PR 559](https://github.com/RVC-Boss/GPT-SoVITS/pull/559), [PR 556](https://github.com/RVC-Boss/GPT-SoVITS/pull/556), [PR 532](https://github.com/RVC-Boss/GPT-SoVITS/pull/532), [PR 507](https://github.com/RVC-Boss/GPT-SoVITS/pull/507), [PR 509](https://github.com/RVC-Boss/GPT-SoVITS/pull/509) ile Çince ve Japonca önyüz işlemesini optimize ettik.
|
96 |
+
3. Mac CPU çıkarımını daha hızlı performans için MPS yerine CPU kullanacak şekilde değiştirdik.
|
97 |
+
4. Colab genel URL sorununu düzelttik.
|
98 |
+
|
99 |
+
### 20240306 Güncellemesi
|
100 |
+
|
101 |
+
1. [PR 672](https://github.com/RVC-Boss/GPT-SoVITS/pull/672) çıkarımı %50 hızlandırdı (RTX3090 + PyTorch 2.2.1 + CU11.8 + Win10 + Py39 üzerinde test edildi).
|
102 |
+
2. Faster Whisper'ın Çince olmayan ASR'sini kullanırken artık önce Çin FunASR modelini indirmeyi gerektirmiyor.
|
103 |
+
3. [PR 610](https://github.com/RVC-Boss/GPT-SoVITS/pull/610) UVR5 yankı giderme modelindeki ayarın tersine çevrildiği sorunu düzeltti.
|
104 |
+
4. [PR 675](https://github.com/RVC-Boss/GPT-SoVITS/pull/675) CUDA mevcut olmadığında Faster Whisper için otomatik CPU çıkarımını etkinleştirdi.
|
105 |
+
5. [PR 573](https://github.com/RVC-Boss/GPT-SoVITS/pull/573) Mac'te doğru CPU çıkarımı sağlamak için `is_half` kontrolünü değiştirdi.
|
106 |
+
|
107 |
+
### 202403/202404/202405 Güncellemeleri
|
108 |
+
|
109 |
+
#### Küçük Düzeltmeler:
|
110 |
+
|
111 |
+
1. Referans metin olmayan mod ile ilgili sorunlar düzeltildi.
|
112 |
+
2. Çince ve İngilizce metin önyüzü optimize edildi.
|
113 |
+
3. API formatı iyileştirildi.
|
114 |
+
4. CMD format sorunları düzeltildi.
|
115 |
+
5. Eğitim verisi işleme sırasında desteklenmeyen diller için hata uyarıları eklendi.
|
116 |
+
6. Hubert çıkarımındaki hata düzeltildi.
|
117 |
+
|
118 |
+
#### Büyük Düzeltmeler:
|
119 |
+
|
120 |
+
1. VQ'yu dondurmadan yapılan SoVITS eğitimi sorunu (bu kalite düşüşüne neden olabilir) düzeltildi.
|
121 |
+
2. Hızlı çıkarım dalı eklendi.
|
122 |
+
|
123 |
+
### 20240610 Güncellemesi
|
124 |
+
|
125 |
+
#### Küçük Düzeltmeler:
|
126 |
+
|
127 |
+
1. [PR 1168](https://github.com/RVC-Boss/GPT-SoVITS/pull/1168) & [PR 1169](https://github.com/RVC-Boss/GPT-SoVITS/pull/1169) saf noktalama işareti ve çoklu noktalama işareti metin girdisi için mantığı geliştirdi.
|
128 |
+
2. [Commit 501a74a](https://github.com/RVC-Boss/GPT-SoVITS/commit/501a74ae96789a26b48932babed5eb4e9483a232) UVR5'teki MDXNet yankı giderme için CMD formatını düzeltti, boşluk içeren yolları destekledi.
|
129 |
+
3. [PR 1159](https://github.com/RVC-Boss/GPT-SoVITS/pull/1159) `s2_train.py` içindeki SoVITS eğitimi için ilerleme çubuğu mantığını düzeltti.
|
130 |
+
|
131 |
+
#### Büyük Düzeltmeler:
|
132 |
+
|
133 |
+
4. [Commit 99f09c8](https://github.com/RVC-Boss/GPT-SoVITS/commit/99f09c8bdc155c1f4272b511940717705509582a) WebUI'nin GPT ince ayarının, Çince giriş metinlerinin BERT özelliğini okumaması sorununu düzeltti, bu da çıkarım ile tutarsızlığa ve potansiyel kalite düşüşüne neden oluyordu.
|
134 |
+
**Dikkat: Daha önce büyük miktarda veri ile ince ayar yaptıysanız, modelin kalitesini artırmak için yeniden ayar yapmanız önerilir.**
|
135 |
+
|
136 |
+
### 20240706 Güncellemesi
|
137 |
+
|
138 |
+
#### Küçük Düzeltmeler:
|
139 |
+
|
140 |
+
1. [Commit 1250670](https://github.com/RVC-Boss/GPT-SoVITS/commit/db50670598f0236613eefa6f2d5a23a271d82041) CPU çıkarımında varsayılan yığın boyutu ondalık sorununu düzeltti.
|
141 |
+
2. [PR 1258](https://github.com/RVC-Boss/GPT-SoVITS/pull/1258), [PR 1265](https://github.com/RVC-Boss/GPT-SoVITS/pull/1265), [PR 1267](https://github.com/RVC-Boss/GPT-SoVITS/pull/1267) gürültü giderme veya ASR ile ilgili istisnalarla karşılaşıldığında bekleyen tüm ses dosyalarının çıkış yapmasına neden olan sorunları düzeltti.
|
142 |
+
3. [PR 1253](https://github.com/RVC-Boss/GPT-SoVITS/pull/1253) noktalama işaretlerine göre ayrılırken ondalıkların bölünmesi sorununu düzeltti.
|
143 |
+
4. [Commit a208698](https://github.com/RVC-Boss/GPT-SoVITS/commit/a208698e775155efc95b187b746d153d0f2847ca) çoklu GPU eğitimi için çoklu işlem kaydetme mantığını düzeltti.
|
144 |
+
5. [PR 1251](https://github.com/RVC-Boss/GPT-SoVITS/pull/1251) gereksiz `my_utils`'ı kaldırdı.
|
145 |
+
|
146 |
+
#### Büyük Düzeltmeler:
|
147 |
+
|
148 |
+
6. [PR 672](https://github.com/RVC-Boss/GPT-SoVITS/pull/672) hızlandırılmış çıkarım kodu doğrulandı ve ana dala birleştirildi, taban ile tutarlı çıkarım etkileri sağlandı.
|
149 |
+
Ayrıca referans metni olmayan modda hızlandırılmış çıkarımı destekler.
|
150 |
+
|
151 |
+
**Gelecek güncellemeler, `fast_inference` dalındaki değişikliklerin tutarlılığını doğrulamaya devam edecek.**
|
152 |
+
|
153 |
+
### 20240727 Güncellemesi
|
154 |
+
|
155 |
+
#### Küçük Düzeltmeler:
|
156 |
+
|
157 |
+
1. [PR 1298](https://github.com/RVC-Boss/GPT-SoVITS/pull/1298) gereksiz i18n kodlarını temizledi.
|
158 |
+
2. [PR 1299](https://github.com/RVC-Boss/GPT-SoVITS/pull/1299) kullanıcı dosya yollarındaki sonlandırma eğik çizgilerinin komut satırı hatalarına neden olduğu sorunları düzeltti.
|
159 |
+
3. [PR 756](https://github.com/RVC-Boss/GPT-SoVITS/pull/756) GPT eğitimindeki adım hesaplama mantığını düzeltti.
|
160 |
+
|
161 |
+
#### Büyük Düzeltmeler:
|
162 |
+
|
163 |
+
4. [Commit 9588a3c](https://github.com/RVC-Boss/GPT-SoVITS/commit/9588a3c52d9ebdb20b3c5d74f647d12e7c1171c2) sentez için konuşma hızı ayarlamasını destekledi.
|
164 |
+
Konuşma hızını ayarlarken rastgeleliği dondurmayı etkinleştirdi.
|
165 |
+
|
166 |
+
### 20240806 Güncellemesi
|
167 |
+
|
168 |
+
1. [PR 1306](https://github.com/RVC-Boss/GPT-SoVITS/pull/1306), [PR 1356](https://github.com/RVC-Boss/GPT-SoVITS/pull/1356) BS RoFormer vokal eşlik ayırma modelini desteklemeye başladı. [Commit e62e965](https://github.com/RVC-Boss/GPT-SoVITS/commit/e62e965323a60a76a025bcaa45268c1ddcbcf05c) FP16 çıkarımı etkinleştirdi.
|
169 |
+
2. Çince metin ön yüzünü geliştirdi.
|
170 |
+
- [PR 488](https://github.com/RVC-Boss/GPT-SoVITS/pull/488) çoklu heceli karakterler için destek ekledi (v2 sadece);
|
171 |
+
- [PR 987](https://github.com/RVC-Boss/GPT-SoVITS/pull/987) sayı belirleyici ekledi;
|
172 |
+
- [PR 1351](https://github.com/RVC-Boss/GPT-SoVITS/pull/1351) aritmetik ve temel matematik formüllerini destekler;
|
173 |
+
- [PR 1404](https://github.com/RVC-Boss/GPT-SoVITS/pull/1404) karışık metin hatalarını düzeltti.
|
174 |
+
3. [PR 1355](https://github.com/RVC-Boss/GPT-SoVITS/pull/1356) WebUI'de ses işlenirken yolları otomatik olarak doldurdu.
|
175 |
+
4. [Commit bce451a](https://github.com/RVC-Boss/GPT-SoVITS/commit/bce451a2d1641e581e200297d01f219aeaaf7299), [Commit 4c8b761](https://github.com/RVC-Boss/GPT-SoVITS/commit/4c8b7612206536b8b4435997acb69b25d93acb78) GPU tanıma mantığını optimize etti.
|
176 |
+
5. [Commit 8a10147](https://github.com/RVC-Boss/GPT-SoVITS/commit/8a101474b5a4f913b4c94fca2e3ca87d0771bae3) Kantonca ASR desteği ekledi.
|
177 |
+
6. GPT-SoVITS v2 desteği eklendi.
|
178 |
+
7. [PR 1387](https://github.com/RVC-Boss/GPT-SoVITS/pull/1387) zamanlama mantığını optimize etti.
|
179 |
+
|
180 |
+
### 20240821 Güncelleme
|
181 |
+
|
182 |
+
1. [PR 1490](https://github.com/RVC-Boss/GPT-SoVITS/pull/1490) `fast_inference` dalını ana dala birleştir.
|
183 |
+
2. [Issue 1508](https://github.com/RVC-Boss/GPT-SoVITS/issues/1508) SSML etiketlerini kullanarak sayıları, telefon numaralarını, tarihleri ve saatleri optimize etme desteği.
|
184 |
+
3. [PR 1503](https://github.com/RVC-Boss/GPT-SoVITS/pull/1503) API düzeltildi ve optimize edildi.
|
185 |
+
4. [PR 1422](https://github.com/RVC-Boss/GPT-SoVITS/pull/1422) Karıştırmak için yalnızca bir referans sesi yüklenebiliyordu hatası düzeltildi, çeşitli veri seti kontrolleri eklendi ve eksik dosyalar için uyarılar çıkar.
|
186 |
+
|
187 |
+
### 20250211 Güncellemesi
|
188 |
+
|
189 |
+
- [Wiki](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)) GPT-SoVITS v3 Modeli Eklendi, SoVITS v3'ü ince ayar yapmak için 14GB GPU belleği gereklidir.
|
190 |
+
|
191 |
+
### 20250212 Güncellemesi
|
192 |
+
|
193 |
+
- [PR 2040](https://github.com/RVC-Boss/GPT-SoVITS/pull/2040) SoVITS v3'ü ince ayar yapmak için gradyan kontrol noktası ekledi, 12GB GPU belleği gereklidir.
|
194 |
+
|
195 |
+
### 20250214 Güncellemesi
|
196 |
+
|
197 |
+
- [PR 2047](https://github.com/RVC-Boss/GPT-SoVITS/pull/2047) Çok dilli karışık metin segmentasyon stratejisi **A**'yı optimize etti.
|
198 |
+
- `split-lang` bir dil segmentasyon aracı olarak eklendi ve çok dilli karışık metinlerin segmentasyon yeteneklerini iyileştirdi.
|
199 |
+
|
200 |
+
### 20250217 Güncellemesi
|
201 |
+
|
202 |
+
- [PR 2062](https://github.com/RVC-Boss/GPT-SoVITS/pull/2062) Metindeki sayılar ve İngilizceyi işleme mantığını optimize etti.
|
203 |
+
|
204 |
+
### 20250218 Güncellemesi
|
205 |
+
|
206 |
+
- [PR 2073](https://github.com/RVC-Boss/GPT-SoVITS/pull/2073) Çok dilli karışık metin segmentasyon stratejisi **B**'yi optimize etti.
|
207 |
+
|
208 |
+
### 20250223 Güncellemesi
|
209 |
+
|
210 |
+
1. SoVITS V3 için LoRA eğitimi, ince ayar yapmayı destekler. 8GB GPU belleği gereklidir ve sonuçlar tam parametreli ince ayar yapmaktan daha iyidir.
|
211 |
+
2. [PR 2078](https://github.com/RVC-Boss/GPT-SoVITS/pull/2078) Mel Band RoFormer modelini vokal ve enstrüman ayrımı için ekledi.
|
212 |
+
|
213 |
+
### 20250226 Güncellemesi
|
214 |
+
|
215 |
+
1. [PR 2112](https://github.com/RVC-Boss/GPT-SoVITS/pull/2112) Windows'ta İngilizce olmayan dizinlerden kaynaklanan sorunları düzeltti.
|
216 |
+
- Korece için `langsegmenter` kullanımı ile ilgili sorun düzeltildi.
|
217 |
+
2. [PR 2113](https://github.com/RVC-Boss/GPT-SoVITS/pull/2114) Windows'ta İngilizce olmayan dizinlerden kaynaklanan sorunları düzeltti.
|
218 |
+
- Korece/Japonca için `langsegmenter` kullanımı ile ilgili sorun düzeltildi.
|
219 |
+
|
220 |
+
### 20250227 Güncellemesi
|
221 |
+
|
222 |
+
- V3 modeliyle 24K ses oluştururken meydana gelen boğukluk sorununu hafifletmek için, 24K'dan 48K'ya ses süper çözünürlük modelleri eklendi. [Issue 2085](https://github.com/RVC-Boss/GPT-SoVITS/issues/2085), [Issue 2117](https://github.com/RVC-Boss/GPT-SoVITS/issues/2117) de bildirilen sorunlar.
|
docs/tr/README.md
ADDED
@@ -0,0 +1,385 @@
1 |
+
<div align="center">
|
2 |
+
|
3 |
+
<h1>GPT-SoVITS-WebUI</h1>
|
4 |
+
Güçlü Birkaç Örnekli Ses Dönüştürme ve Metinden Konuşmaya Web Arayüzü.<br><br>
|
5 |
+
|
6 |
+
[](https://github.com/RVC-Boss/GPT-SoVITS)
|
7 |
+
|
8 |
+
<a href="https://trendshift.io/repositories/7033" target="_blank"><img src="https://trendshift.io/api/badge/repositories/7033" alt="RVC-Boss%2FGPT-SoVITS | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
|
9 |
+
|
10 |
+
<!-- img src="https://counter.seku.su/cmoe?name=gptsovits&theme=r34" /><br> -->
|
11 |
+
|
12 |
+
[](https://colab.research.google.com/github/RVC-Boss/GPT-SoVITS/blob/main/colab_webui.ipynb)
|
13 |
+
[](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/LICENSE)
|
14 |
+
[](https://huggingface.co/spaces/lj1995/GPT-SoVITS-v2)
|
15 |
+
[](https://discord.gg/dnrgs5GHfG)
|
16 |
+
|
17 |
+
[**English**](../../README.md) | [**中文简体**](../cn/README.md) | [**日本語**](../ja/README.md) | [**한국어**](../ko/README.md) | **Türkçe**
|
18 |
+
|
19 |
+
</div>
|
20 |
+
|
21 |
+
---
|
22 |
+
|
23 |
+
## Özellikler:
|
24 |
+
|
25 |
+
1. **Sıfır Örnekli Metinden Konuşmaya:** 5 saniyelik bir vokal örneği girin ve anında metinden konuşmaya dönüşümünü deneyimleyin.
|
26 |
+
|
27 |
+
2. **Birkaç Örnekli Metinden Konuşmaya:** Daha iyi ses benzerliği ve gerçekçiliği için modeli yalnızca 1 dakikalık eğitim verisiyle ince ayarlayın.
|
28 |
+
|
29 |
+
3. **Çapraz Dil Desteği:** Eğitim veri setinden farklı dillerde çıkarım, şu anda İngilizce, Japonca, Çince, Kantonca ve Koreceyi destekliyor.
|
30 |
+
|
31 |
+
4. **Web Arayüzü Araçları:** Entegre araçlar arasında vokal eşliğinde ayırma, otomatik eğitim seti segmentasyonu, Çince ASR ve metin etiketleme bulunur ve yeni başlayanların eğitim veri setleri ve GPT/SoVITS modelleri oluşturmalarına yardımcı olur.
|
32 |
+
|
33 |
+
**[Demo videomuzu](https://www.bilibili.com/video/BV12g4y1m7Uw) buradan izleyin!**
|
34 |
+
|
35 |
+
Görünmeyen konuşmacılar birkaç örnekli ince ayar demosu:
|
36 |
+
|
37 |
+
https://github.com/RVC-Boss/GPT-SoVITS/assets/129054828/05bee1fa-bdd8-4d85-9350-80c060ab47fb
|
38 |
+
|
39 |
+
**Kullanıcı Kılavuzu: [简体中文](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e) | [English](https://rentry.co/GPT-SoVITS-guide#/)**
|
40 |
+
|
41 |
+
## Kurulum
|
42 |
+
|
43 |
+
### Test Edilmiş Ortamlar
|
44 |
+
|
45 |
+
| Python Version | PyTorch Version | Device |
|
46 |
+
|----------------|------------------|-----------------|
|
47 |
+
| Python 3.9 | PyTorch 2.0.1 | CUDA 11.8 |
|
48 |
+
| Python 3.10.13 | PyTorch 2.1.2 | CUDA 12.3 |
|
49 |
+
| Python 3.10.17 | PyTorch 2.5.1 | CUDA 12.4 |
|
50 |
+
| Python 3.9 | PyTorch 2.5.1 | Apple silicon |
|
51 |
+
| Python 3.11 | PyTorch 2.6.0 | Apple silicon |
|
52 |
+
| Python 3.9 | PyTorch 2.2.2 | CPU |
|
53 |
+
|
54 |
+
### Windows
|
55 |
+
|
56 |
+
Eğer bir Windows kullanıcısıysanız (win>=10 ile test edilmiştir), [entegre paketi indirin](https://huggingface.co/lj1995/GPT-SoVITS-windows-package/resolve/main/GPT-SoVITS-v3lora-20250228.7z?download=true) ve _go-webui.bat_ dosyasına çift tıklayarak GPT-SoVITS-WebUI'yi başlatın.
|
57 |
+
|
58 |
+
### Linux
|
59 |
+
|
60 |
+
```bash
|
61 |
+
conda create -n GPTSoVits python=3.9
|
62 |
+
conda activate GPTSoVits
|
63 |
+
bash install.sh --source <HF|HF-Mirror|ModelScope> [--download-uvr5]
|
64 |
+
```
|
65 |
+
|
66 |
+
### macOS
|
67 |
+
|
68 |
+
**Not: Mac'lerde GPU'larla eğitilen modeller, diğer cihazlarda eğitilenlere göre önemli ölçüde daha düşük kalitede sonuç verir, bu nedenle geçici olarak CPU'lar kullanıyoruz.**
|
69 |
+
|
70 |
+
1. `xcode-select --install` komutunu çalıştırarak Xcode komut satırı araçlarını yükleyin.
|
71 |
+
2. Aşağıdaki komutları çalıştırarak programı yükleyin:
|
72 |
+
|
73 |
+
```bash
|
74 |
+
conda create -n GPTSoVits python=3.9
|
75 |
+
conda activate GPTSoVits
|
76 |
+
bash install.sh --source <HF|HF-Mirror|ModelScope> [--download-uvr5]
|
77 |
+
```
|
78 |
+
|
79 |
+
### El ile Yükleme
|
80 |
+
|
81 |
+
#### FFmpeg'i Yükleme
|
82 |
+
|
83 |
+
##### Conda Kullanıcıları
|
84 |
+
|
85 |
+
```bash
|
86 |
+
conda install ffmpeg
|
87 |
+
```
|
88 |
+
|
89 |
+
##### Ubuntu/Debian Kullanıcıları
|
90 |
+
|
91 |
+
```bash
|
92 |
+
sudo apt install ffmpeg
|
93 |
+
sudo apt install libsox-dev
|
94 |
+
conda install -c conda-forge 'ffmpeg<7'
|
95 |
+
```
|
96 |
+
|
97 |
+
##### Windows Kullanıcıları
|
98 |
+
|
99 |
+
[ffmpeg.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffmpeg.exe) ve [ffprobe.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffprobe.exe) dosyalarını indirin ve GPT-SoVITS kök dizinine yerleştirin.
|
100 |
+
|
101 |
+
##### MacOS Kullanıcıları
|
102 |
+
|
103 |
+
```bash
|
104 |
+
brew install ffmpeg
|
105 |
+
```
|
106 |
+
|
107 |
+
#### Bağımlılıkları Yükleme
|
108 |
+
|
109 |
+
```bash
|
110 |
+
pip install -r extra-req.txt --no-deps
|
111 |
+
pip install -r requirements.txt
|
112 |
+
```

### Using Docker

#### docker-compose.yaml configuration

0. Regarding image tags: because the codebase updates quickly while packaging and testing images is slow, please check [Docker Hub](https://hub.docker.com/r/breakstring/gpt-sovits) (outdated) for the latest packaged images and choose according to your situation, or alternatively build locally with a Dockerfile tailored to your own needs.
1. Environment variables:

   - is_half: controls half/full precision. This is usually the reason why content under the 4-cnhubert/5-wav32k directories is not generated correctly during the "SSL extraction" step. Set it to True or False according to your actual situation.

2. Volume configuration: the application's root directory inside the container is set to /workspace. The default docker-compose.yaml lists some practical examples for uploading/downloading content.
3. shm_size: the default available memory for Docker Desktop on Windows is too small, which can cause abnormal operation. Adjust it according to your own situation.
4. Under the deploy section, GPU-related settings should be adjusted carefully according to your system and actual conditions.

#### Running with docker compose

```
docker compose -f "docker-compose.yaml" up -d
```

#### Running with the docker command

As above, modify the corresponding parameters according to your actual situation, then run the following command:

```
docker run --rm -it --gpus=all --env=is_half=False --volume=G:\GPT-SoVITS-DockerTest\output:/workspace/output --volume=G:\GPT-SoVITS-DockerTest\logs:/workspace/logs --volume=G:\GPT-SoVITS-DockerTest\SoVITS_weights:/workspace/SoVITS_weights --workdir=/workspace -p 9880:9880 -p 9871:9871 -p 9872:9872 -p 9873:9873 -p 9874:9874 --shm-size="16G" -d breakstring/gpt-sovits:xxxxx
```

## Pretrained Models

**If `install.sh` runs successfully, you can skip steps 1, 2, and 3.**

1. Download the pretrained models from [GPT-SoVITS Models](https://huggingface.co/lj1995/GPT-SoVITS) and place them in `GPT_SoVITS/pretrained_models`.

2. Download the model from [G2PWModel.zip(HF)](https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/G2PWModel.zip) | [G2PWModel.zip(ModelScope)](https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/G2PWModel.zip), unzip it, rename it to `G2PWModel`, and place it in `GPT_SoVITS/text`. (Chinese TTS only)

3. For UVR5 (vocal/instrumental separation & de-reverb), download the models from [UVR5 Weights](https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main/uvr5_weights) and place them in `tools/uvr5/uvr5_weights`.

   - If you use bs_roformer or mel_band_roformer models in UVR5, you can manually download the model and its corresponding configuration file and place them in `tools/UVR5/UVR5_weights`. **The model file and the configuration file must have the same name except for the extension.** In addition, both file names must contain **"roformer"** so that the model is recognized as a roformer-class model.

   - It is **recommended to state the model type directly** in the model and configuration file names, e.g. mel_mand_roformer, bs_roformer. If not specified, the model type is determined by comparing features from the configuration file. For example, the model `bs_roformer_ep_368_sdr_12.9628.ckpt` and the configuration file `bs_roformer_ep_368_sdr_12.9628.yaml` are a pair, as are `kim_mel_band_roformer.ckpt` and `kim_mel_band_roformer.yaml`.

4. For Chinese ASR, download models from [Damo ASR Model](https://modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/files), [Damo VAD Model](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/files), and [Damo Punc Model](https://modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/files), and place them in `tools/asr/models`.

5. For English or Japanese ASR, download the model from [Faster Whisper Large V3](https://huggingface.co/Systran/faster-whisper-large-v3) and place it in `tools/asr/models`. Also, [other models](https://huggingface.co/Systran) may achieve a similar effect while taking up less disk space.

## Dataset Format

The TTS annotation .list file format:

```
vocal_path|speaker_name|language|text
```

Language dictionary:

- 'zh': Chinese
- 'ja': Japanese
- 'en': English
- 'ko': Korean
- 'yue': Cantonese

Example:

```
D:\GPT-SoVITS\xxx/xxx.wav|xxx|en|I like playing Genshin.
```
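If you script your own preprocessing, a .list file is easy to consume. The sketch below shows one way to read it; `parse_list_file` is an illustrative helper name, not something provided by the repository.

```python
# Minimal sketch for reading a GPT-SoVITS annotation .list file.
# The four pipe-separated fields follow the format shown above.
from pathlib import Path


def parse_list_file(list_path: str):
    entries = []
    for line in Path(list_path).read_text(encoding="utf-8").splitlines():
        if not line.strip():
            continue
        vocal_path, speaker_name, language, text = line.split("|", 3)
        entries.append(
            {
                "vocal_path": vocal_path,
                "speaker_name": speaker_name,
                "language": language.lower(),  # 'zh', 'ja', 'en', 'ko' or 'yue'
                "text": text,
            }
        )
    return entries


# Example: print the language and text of every annotated clip.
# for entry in parse_list_file("output/asr_opt/slicer_opt.list"):
#     print(entry["language"], entry["text"])
```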

## Fine-Tuning and Inference

### Open the WebUI

#### Integrated Package Users

Double-click `go-webui.bat` or use `go-webui.ps1`.
If you want to switch to V1, double-click `go-webui-v1.bat` or use `go-webui-v1.ps1`.

#### Others

```bash
python webui.py <language(optional)>
```

If you want to switch to V1,

```bash
python webui.py v1 <language(optional)>
```

or switch the version manually in the WebUI.

### Fine-Tuning

#### Path auto-filling is now supported

1. Fill in the audio path
2. Slice the audio into small chunks
3. Denoise (optional)
4. ASR
5. Proofread the ASR transcriptions
6. Go to the next tab and fine-tune the model

### Open the Inference WebUI

#### Integrated Package Users

Double-click `go-webui-v2.bat` or use `go-webui-v2.ps1`, then open the inference WebUI at `1-GPT-SoVITS-TTS/1C-inference`.

#### Others

```bash
python GPT_SoVITS/inference_webui.py <language(optional)>
```

OR

```bash
python webui.py
```

then open the inference WebUI at `1-GPT-SoVITS-TTS/1C-inference`.

## V2 Release Notes

New features:

1. Supports Korean and Cantonese

2. Optimized text frontend

3. Pretrained model extended from 2k hours to 5k hours

4. Improved synthesis quality for low-quality reference audio

[more details here](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v2%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)

To use V2 from a V1 environment:

1. Update some packages with `pip install -r requirements.txt`

2. Clone the latest code from GitHub.

3. Download the v2 pretrained models from [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main/gsv-v2final-pretrained) and place them in `GPT_SoVITS\pretrained_models\gsv-v2final-pretrained`.

Additionally for Chinese V2: [G2PWModel.zip(HF)](https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/G2PWModel.zip) | [G2PWModel.zip(ModelScope)](https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/G2PWModel.zip) (download the G2PW models, unzip, rename to `G2PWModel`, and place in `GPT_SoVITS/text`.)

## V3 Release Notes

### New Features:

1. **Timbre similarity** is higher, so less training data is needed to approximate the target speaker (timbre similarity is noticeably improved even when the base model is used directly without fine-tuning).

2. The GPT model is more **stable**, with fewer repetitions and omissions, and it is easier to generate speech with **richer emotional expression**.

[more details](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)

### Using v3 from a v2 environment:

1. Update some packages with `pip install -r requirements.txt`.

2. Clone the latest code from GitHub.

3. Download the v3 pretrained models (s1v3.ckpt, s2Gv3.pth, and the models--nvidia--bigvgan_v2_24khz_100band_256x folder) from [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main) and place them in `GPT_SoVITS\pretrained_models`.

Additional: for the audio super-resolution model, see [how to download it](../../tools/AP_BWE_main/24kto48k/readme.txt).

## Todo List

- [x] **High priority:**

  - [x] Localization into Japanese and English.
  - [x] User guide.
  - [x] Japanese and English dataset fine-tuning training.

- [ ] **Features:**
  - [x] Zero-shot voice conversion (5s) / few-shot voice conversion (1min).
  - [x] TTS speaking-speed control.
  - [ ] ~~Enhanced TTS emotion control.~~
  - [ ] Experiment with changing SoVITS token inputs to a probability distribution over the vocabulary.
  - [x] Improve the English and Japanese text frontend.
  - [ ] Develop small and larger-sized TTS models.
  - [x] Colab scripts.
  - [ ] Try to expand the training dataset (2k hours -> 10k hours).
  - [x] better sovits base model (enhanced audio quality)
  - [ ] model mix

## (Extra) How to run from the command line

Use the command line to open the WebUI for UVR5

```
python tools/uvr5/webui.py "<infer_device>" <is_half> <webui_port_uvr5>
```

<!-- If you cannot open a browser, follow the format below for UVR processing. It uses mdxnet for audio processing.
```
python mdxnet.py --model --input_root --output_vocal --output_ins --agg_level --format --device --is_half_precision
``` -->

This is how audio segmentation of the dataset is done from the command line

```
python audio_slicer.py \
    --input_path "<path_to_original_audio_file_or_directory>" \
    --output_root "<directory_where_subdivided_audio_clips_will_be_saved>" \
    --threshold <volume_threshold> \
    --min_length <minimum_duration_of_each_subclip> \
    --min_interval <shortest_time_gap_between_adjacent_subclips> \
    --hop_size <step_size_for_computing_volume_curve>
```

This is how dataset ASR processing is done from the command line (Chinese only)

```
python tools/asr/funasr_asr.py -i <input> -o <output>
```

ASR processing is performed through Faster_Whisper (ASR annotation for languages other than Chinese)

(No progress bars; GPU performance may cause time delays)

```
python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language>
```
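If you prefer to drive the ASR step from Python instead of the shell, a sketch like the one below should work when run from the repository root; it calls the `execute_asr` function defined in `tools/asr/fasterwhisper_asr.py` (its signature appears later in this change set), and the folder paths are only examples.

```python
# Sketch: run the Faster-Whisper ASR step programmatically (paths are examples).
from tools.asr.fasterwhisper_asr import execute_asr

list_path = execute_asr(
    input_folder="output/slicer_opt",   # folder containing the sliced wav files
    output_folder="output/asr_opt",
    model_size="large-v3",
    language="ja",                      # any supported code, or "auto"
    precision="float16",
)
print(list_path)  # path of the generated .list annotation file
```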

A custom list save path is now enabled

## Credits

Special thanks to the following projects and contributors:

### Theoretical Research

- [ar-vits](https://github.com/innnky/ar-vits)
- [SoundStorm](https://github.com/yangdongchao/SoundStorm/tree/master/soundstorm/s1/AR)
- [vits](https://github.com/jaywalnut310/vits)
- [TransferTTS](https://github.com/hcy71o/TransferTTS/blob/master/models.py#L556)
- [contentvec](https://github.com/auspicious3000/contentvec/)
- [hifi-gan](https://github.com/jik876/hifi-gan)
- [fish-speech](https://github.com/fishaudio/fish-speech/blob/main/tools/llama/generate.py#L41)
- [f5-TTS](https://github.com/SWivid/F5-TTS/blob/main/src/f5_tts/model/backbones/dit.py)
- [shortcut flow matching](https://github.com/kvfrans/shortcut-models/blob/main/targets_shortcut.py)

### Pretrained Models

- [Chinese Speech Pretrain](https://github.com/TencentGameMate/chinese_speech_pretrain)
- [Chinese-Roberta-WWM-Ext-Large](https://huggingface.co/hfl/chinese-roberta-wwm-ext-large)
- [BigVGAN](https://github.com/NVIDIA/BigVGAN)

### Text Frontend for Inference

- [paddlespeech zh_normalization](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/zh_normalization)
- [split-lang](https://github.com/DoodleBears/split-lang)
- [g2pW](https://github.com/GitYCC/g2pW)
- [pypinyin-g2pW](https://github.com/mozillazg/pypinyin-g2pW)
- [paddlespeech g2pw](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/g2pw)

### WebUI Tools

- [ultimatevocalremovergui](https://github.com/Anjok07/ultimatevocalremovergui)
- [audio-slicer](https://github.com/openvpi/audio-slicer)
- [SubFix](https://github.com/cronrpc/SubFix)
- [FFmpeg](https://github.com/FFmpeg/FFmpeg)
- [gradio](https://github.com/gradio-app/gradio)
- [faster-whisper](https://github.com/SYSTRAN/faster-whisper)
- [FunASR](https://github.com/alibaba-damo-academy/FunASR)
- [AP-BWE](https://github.com/yxlu-0102/AP-BWE)

Many thanks to @Naozumi520 for providing the Cantonese training set and for guidance on Cantonese-related knowledge.

## Thanks to all contributors for their efforts

<a href="https://github.com/RVC-Boss/GPT-SoVITS/graphs/contributors" target="_blank">
  <img src="https://contrib.rocks/image?repo=RVC-Boss/GPT-SoVITS" />
</a>
tools/AP_BWE_main/24kto48k/readme.txt
ADDED
@@ -0,0 +1,11 @@
For the inference of the v3 model, if you find that the generated audio sounds somewhat muffled, you can try using this audio super-resolution model.

Put g_24kto48k.zip and config.json in this folder.

Download link:
https://drive.google.com/drive/folders/1IIYTf2zbJWzelu4IftKD6ooHloJ8mnZF?usp=share_link

Audio SR project page:
https://github.com/yxlu-0102/AP-BWE
tools/AP_BWE_main/LICENSE
ADDED
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2023 Ye-Xin Lu

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
tools/AP_BWE_main/README.md
ADDED
@@ -0,0 +1,91 @@
# Towards High-Quality and Efficient Speech Bandwidth Extension with Parallel Amplitude and Phase Prediction

### Ye-Xin Lu, Yang Ai, Hui-Peng Du, Zhen-Hua Ling

**Abstract:**
Speech bandwidth extension (BWE) refers to widening the frequency bandwidth range of speech signals, making the speech sound brighter and fuller.
This paper proposes a generative adversarial network (GAN) based BWE model with parallel prediction of Amplitude and Phase spectra, named AP-BWE, which achieves both high-quality and efficient wideband speech waveform generation.
The proposed AP-BWE generator is entirely based on convolutional neural networks (CNNs).
It features a dual-stream architecture with mutual interaction, where the amplitude stream and the phase stream communicate with each other and respectively extend the high-frequency components from the input narrowband amplitude and phase spectra.
To improve the naturalness of the extended speech signals, we employ a multi-period discriminator at the waveform level and design a pair of multi-resolution amplitude and phase discriminators at the spectral level, respectively.
Experimental results demonstrate that our proposed AP-BWE achieves state-of-the-art performance in terms of speech quality for BWE tasks targeting sampling rates of both 16 kHz and 48 kHz.
In terms of generation efficiency, due to the all-convolutional architecture and all-frame-level operations, the proposed AP-BWE can generate 48 kHz waveform samples 292.3 times faster than real-time on a single RTX 4090 GPU and 18.1 times faster than real-time on a single CPU.
Notably, to our knowledge, AP-BWE is the first to achieve the direct extension of the high-frequency phase spectrum, which is beneficial for improving the effectiveness of existing BWE methods.

**We provide our implementation as open source in this repository. Audio samples can be found at the [demo website](http://yxlu-0102.github.io/AP-BWE).**


## Pre-requisites
0. Python >= 3.9.
0. Clone this repository.
0. Install python requirements. Please refer to [requirements.txt](requirements.txt).
0. Download datasets
    1. Download and extract the [VCTK-0.92 dataset](https://datashare.ed.ac.uk/handle/10283/3443), move its `wav48` directory into [VCTK-Corpus-0.92](VCTK-Corpus-0.92), and rename it `wav48_origin`.
    1. Trim the silence of the dataset; the trimmed files will be saved to `wav48_silence_trimmed`.
    ```
    cd VCTK-Corpus-0.92
    python flac2wav.py
    ```
    1. Move all the trimmed training files from `wav48_silence_trimmed` to [wav48/train](wav48/train) following the indexes in [training.txt](VCTK-Corpus-0.92/training.txt), and move all the untrimmed test files from `wav48_origin` to [wav48/test](wav48/test) following the indexes in [test.txt](VCTK-Corpus-0.92/test.txt).

## Training
```
cd train
CUDA_VISIBLE_DEVICES=0 python train_16k.py --config [config file path]
CUDA_VISIBLE_DEVICES=0 python train_48k.py --config [config file path]
```
Checkpoints and copies of the configuration file are saved in the `cp_model` directory by default.<br>
You can change the path by using the `--checkpoint_path` option.
Here is an example:
```
CUDA_VISIBLE_DEVICES=0 python train_16k.py --config ../configs/config_2kto16k.json --checkpoint_path ../checkpoints/AP-BWE_2kto16k
```

## Inference
```
cd inference
python inference_16k.py --checkpoint_file [generator checkpoint file path]
python inference_48k.py --checkpoint_file [generator checkpoint file path]
```
You can download the [pretrained weights](https://drive.google.com/drive/folders/1IIYTf2zbJWzelu4IftKD6ooHloJ8mnZF?usp=share_link) we provide and move all the files to the `checkpoints` directory.
<br>
Generated wav files are saved in `generated_files` by default.
You can change the path by adding the `--output_dir` option.
Here is an example:
```
python inference_16k.py --checkpoint_file ../checkpoints/2kto16k/g_2kto16k --output_dir ../generated_files/2kto16k
```

## Model Structure
![model](Figures/model.png)

## Comparison with other speech BWE methods
### 2k/4k/8kHz to 16kHz
<p align="center">
<img src="Figures/table_16k.png" alt="comparison" width="90%"/>
</p>

### 8k/12k/16k/24kHz to 48kHz
<p align="center">
<img src="Figures/table_48k.png" alt="comparison" width="100%"/>
</p>

## Acknowledgements
We referred to [HiFi-GAN](https://github.com/jik876/hifi-gan) and [NSPP](https://github.com/YangAi520/NSPP) to implement this.

## Citation
```
@article{lu2024towards,
  title={Towards high-quality and efficient speech bandwidth extension with parallel amplitude and phase prediction},
  author={Lu, Ye-Xin and Ai, Yang and Du, Hui-Peng and Ling, Zhen-Hua},
  journal={arXiv preprint arXiv:2401.06387},
  year={2024}
}

@inproceedings{lu2024multi,
  title={Multi-Stage Speech Bandwidth Extension with Flexible Sampling Rate Control},
  author={Lu, Ye-Xin and Ai, Yang and Sheng, Zheng-Yan and Ling, Zhen-Hua},
  booktitle={Proc. Interspeech},
  pages={2270--2274},
  year={2024}
}
```
tools/AP_BWE_main/datasets1/__init__.py
ADDED
@@ -0,0 +1 @@
tools/AP_BWE_main/datasets1/dataset.py
ADDED
@@ -0,0 +1,108 @@
import os
import random
import torch
import torchaudio
import torch.utils.data
import torchaudio.functional as aF


def amp_pha_stft(audio, n_fft, hop_size, win_size, center=True):
    hann_window = torch.hann_window(win_size).to(audio.device)
    stft_spec = torch.stft(
        audio,
        n_fft,
        hop_length=hop_size,
        win_length=win_size,
        window=hann_window,
        center=center,
        pad_mode="reflect",
        normalized=False,
        return_complex=True,
    )
    log_amp = torch.log(torch.abs(stft_spec) + 1e-4)
    pha = torch.angle(stft_spec)

    com = torch.stack((torch.exp(log_amp) * torch.cos(pha), torch.exp(log_amp) * torch.sin(pha)), dim=-1)

    return log_amp, pha, com


def amp_pha_istft(log_amp, pha, n_fft, hop_size, win_size, center=True):
    amp = torch.exp(log_amp)
    com = torch.complex(amp * torch.cos(pha), amp * torch.sin(pha))
    hann_window = torch.hann_window(win_size).to(com.device)
    audio = torch.istft(com, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window, center=center)

    return audio

def get_dataset_filelist(a):
    with open(a.input_training_file, "r", encoding="utf-8") as fi:
        training_indexes = [x.split("|")[0] for x in fi.read().split("\n") if len(x) > 0]

    with open(a.input_validation_file, "r", encoding="utf-8") as fi:
        validation_indexes = [x.split("|")[0] for x in fi.read().split("\n") if len(x) > 0]

    return training_indexes, validation_indexes


class Dataset(torch.utils.data.Dataset):
    def __init__(
        self,
        training_indexes,
        wavs_dir,
        segment_size,
        hr_sampling_rate,
        lr_sampling_rate,
        split=True,
        shuffle=True,
        n_cache_reuse=1,
        device=None,
    ):
        self.audio_indexes = training_indexes
        random.seed(1234)
        if shuffle:
            random.shuffle(self.audio_indexes)
        self.wavs_dir = wavs_dir
        self.segment_size = segment_size
        self.hr_sampling_rate = hr_sampling_rate
        self.lr_sampling_rate = lr_sampling_rate
        self.split = split
        self.cached_wav = None
        self.cached_sampling_rate = None
        self.n_cache_reuse = n_cache_reuse
        self._cache_ref_count = 0
        self.device = device

    def __getitem__(self, index):
        filename = self.audio_indexes[index]
        if self._cache_ref_count == 0:
            audio, orig_sampling_rate = torchaudio.load(os.path.join(self.wavs_dir, filename + ".wav"))
            self.cached_wav = audio
            # also cache the sampling rate so the cache-hit branch below does not reference an undefined name
            self.cached_sampling_rate = orig_sampling_rate
            self._cache_ref_count = self.n_cache_reuse
        else:
            audio = self.cached_wav
            orig_sampling_rate = self.cached_sampling_rate
            self._cache_ref_count -= 1

        if orig_sampling_rate == self.hr_sampling_rate:
            audio_hr = audio
        else:
            audio_hr = aF.resample(audio, orig_freq=orig_sampling_rate, new_freq=self.hr_sampling_rate)

        audio_lr = aF.resample(audio, orig_freq=orig_sampling_rate, new_freq=self.lr_sampling_rate)
        audio_lr = aF.resample(audio_lr, orig_freq=self.lr_sampling_rate, new_freq=self.hr_sampling_rate)
        audio_lr = audio_lr[:, : audio_hr.size(1)]

        if self.split:
            if audio_hr.size(1) >= self.segment_size:
                max_audio_start = audio_hr.size(1) - self.segment_size
                audio_start = random.randint(0, max_audio_start)
                audio_hr = audio_hr[:, audio_start : audio_start + self.segment_size]
                audio_lr = audio_lr[:, audio_start : audio_start + self.segment_size]
            else:
                audio_hr = torch.nn.functional.pad(audio_hr, (0, self.segment_size - audio_hr.size(1)), "constant")
                audio_lr = torch.nn.functional.pad(audio_lr, (0, self.segment_size - audio_lr.size(1)), "constant")

        return (audio_hr.squeeze(), audio_lr.squeeze())

    def __len__(self):
        return len(self.audio_indexes)
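As an aside (not part of `dataset.py`), the two helpers `amp_pha_stft` and `amp_pha_istft` defined at the top of this file can be sanity-checked with a quick round trip; with `center=True` the reconstruction differs from the input only because of the `1e-4` amplitude floor applied inside the log.

```python
# Illustrative round-trip check for amp_pha_stft / amp_pha_istft (editor's sketch).
# tools/audio_sr.py makes these importable by appending tools/AP_BWE_main to sys.path.
import sys

sys.path.append("tools/AP_BWE_main")             # run from the repository root
import torch
from datasets1.dataset import amp_pha_stft, amp_pha_istft

wav = torch.randn(1, 48000)                      # one second of fake 48 kHz audio
n_fft, hop_size, win_size = 1024, 256, 1024
log_amp, pha, _ = amp_pha_stft(wav, n_fft, hop_size, win_size)
recon = amp_pha_istft(log_amp, pha, n_fft, hop_size, win_size)
print(log_amp.shape, pha.shape)                  # (1, n_fft // 2 + 1, frames)
print((wav[:, : recon.size(-1)] - recon).abs().max())  # should be close to zero
```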
tools/AP_BWE_main/models/__init__.py
ADDED
@@ -0,0 +1 @@
tools/AP_BWE_main/models/model.py
ADDED
@@ -0,0 +1,464 @@
1 |
+
import torch
|
2 |
+
import torch.nn.functional as F
|
3 |
+
import torch.nn as nn
|
4 |
+
from torch.nn.utils import weight_norm, spectral_norm
|
5 |
+
|
6 |
+
|
7 |
+
# from utils import init_weights, get_padding
|
8 |
+
def get_padding(kernel_size, dilation=1):
|
9 |
+
return int((kernel_size * dilation - dilation) / 2)
|
10 |
+
|
11 |
+
|
12 |
+
def init_weights(m, mean=0.0, std=0.01):
|
13 |
+
classname = m.__class__.__name__
|
14 |
+
if classname.find("Conv") != -1:
|
15 |
+
m.weight.data.normal_(mean, std)
|
16 |
+
|
17 |
+
|
18 |
+
import numpy as np
|
19 |
+
from typing import Tuple, List
|
20 |
+
|
21 |
+
LRELU_SLOPE = 0.1
|
22 |
+
|
23 |
+
|
24 |
+
class ConvNeXtBlock(nn.Module):
|
25 |
+
"""ConvNeXt Block adapted from https://github.com/facebookresearch/ConvNeXt to 1D audio signal.
|
26 |
+
|
27 |
+
Args:
|
28 |
+
dim (int): Number of input channels.
|
29 |
+
intermediate_dim (int): Dimensionality of the intermediate layer.
|
30 |
+
layer_scale_init_value (float, optional): Initial value for the layer scale. None means no scaling.
|
31 |
+
Defaults to None.
|
32 |
+
adanorm_num_embeddings (int, optional): Number of embeddings for AdaLayerNorm.
|
33 |
+
None means non-conditional LayerNorm. Defaults to None.
|
34 |
+
"""
|
35 |
+
|
36 |
+
def __init__(
|
37 |
+
self,
|
38 |
+
dim: int,
|
39 |
+
layer_scale_init_value=None,
|
40 |
+
adanorm_num_embeddings=None,
|
41 |
+
):
|
42 |
+
super().__init__()
|
43 |
+
self.dwconv = nn.Conv1d(dim, dim, kernel_size=7, padding=3, groups=dim) # depthwise conv
|
44 |
+
self.adanorm = adanorm_num_embeddings is not None
|
45 |
+
|
46 |
+
self.norm = nn.LayerNorm(dim, eps=1e-6)
|
47 |
+
self.pwconv1 = nn.Linear(dim, dim * 3) # pointwise/1x1 convs, implemented with linear layers
|
48 |
+
self.act = nn.GELU()
|
49 |
+
self.pwconv2 = nn.Linear(dim * 3, dim)
|
50 |
+
self.gamma = (
|
51 |
+
nn.Parameter(layer_scale_init_value * torch.ones(dim), requires_grad=True)
|
52 |
+
if layer_scale_init_value > 0
|
53 |
+
else None
|
54 |
+
)
|
55 |
+
|
56 |
+
def forward(self, x, cond_embedding_id=None):
|
57 |
+
residual = x
|
58 |
+
x = self.dwconv(x)
|
59 |
+
x = x.transpose(1, 2) # (B, C, T) -> (B, T, C)
|
60 |
+
if self.adanorm:
|
61 |
+
assert cond_embedding_id is not None
|
62 |
+
x = self.norm(x, cond_embedding_id)
|
63 |
+
else:
|
64 |
+
x = self.norm(x)
|
65 |
+
x = self.pwconv1(x)
|
66 |
+
x = self.act(x)
|
67 |
+
x = self.pwconv2(x)
|
68 |
+
if self.gamma is not None:
|
69 |
+
x = self.gamma * x
|
70 |
+
x = x.transpose(1, 2) # (B, T, C) -> (B, C, T)
|
71 |
+
|
72 |
+
x = residual + x
|
73 |
+
return x
|
74 |
+
|
75 |
+
|
76 |
+
class APNet_BWE_Model(torch.nn.Module):
|
77 |
+
def __init__(self, h):
|
78 |
+
super(APNet_BWE_Model, self).__init__()
|
79 |
+
self.h = h
|
80 |
+
self.adanorm_num_embeddings = None
|
81 |
+
layer_scale_init_value = 1 / h.ConvNeXt_layers
|
82 |
+
|
83 |
+
self.conv_pre_mag = nn.Conv1d(h.n_fft // 2 + 1, h.ConvNeXt_channels, 7, 1, padding=get_padding(7, 1))
|
84 |
+
self.norm_pre_mag = nn.LayerNorm(h.ConvNeXt_channels, eps=1e-6)
|
85 |
+
self.conv_pre_pha = nn.Conv1d(h.n_fft // 2 + 1, h.ConvNeXt_channels, 7, 1, padding=get_padding(7, 1))
|
86 |
+
self.norm_pre_pha = nn.LayerNorm(h.ConvNeXt_channels, eps=1e-6)
|
87 |
+
|
88 |
+
self.convnext_mag = nn.ModuleList(
|
89 |
+
[
|
90 |
+
ConvNeXtBlock(
|
91 |
+
dim=h.ConvNeXt_channels,
|
92 |
+
layer_scale_init_value=layer_scale_init_value,
|
93 |
+
adanorm_num_embeddings=self.adanorm_num_embeddings,
|
94 |
+
)
|
95 |
+
for _ in range(h.ConvNeXt_layers)
|
96 |
+
]
|
97 |
+
)
|
98 |
+
|
99 |
+
self.convnext_pha = nn.ModuleList(
|
100 |
+
[
|
101 |
+
ConvNeXtBlock(
|
102 |
+
dim=h.ConvNeXt_channels,
|
103 |
+
layer_scale_init_value=layer_scale_init_value,
|
104 |
+
adanorm_num_embeddings=self.adanorm_num_embeddings,
|
105 |
+
)
|
106 |
+
for _ in range(h.ConvNeXt_layers)
|
107 |
+
]
|
108 |
+
)
|
109 |
+
|
110 |
+
self.norm_post_mag = nn.LayerNorm(h.ConvNeXt_channels, eps=1e-6)
|
111 |
+
self.norm_post_pha = nn.LayerNorm(h.ConvNeXt_channels, eps=1e-6)
|
112 |
+
self.apply(self._init_weights)
|
113 |
+
self.linear_post_mag = nn.Linear(h.ConvNeXt_channels, h.n_fft // 2 + 1)
|
114 |
+
self.linear_post_pha_r = nn.Linear(h.ConvNeXt_channels, h.n_fft // 2 + 1)
|
115 |
+
self.linear_post_pha_i = nn.Linear(h.ConvNeXt_channels, h.n_fft // 2 + 1)
|
116 |
+
|
117 |
+
def _init_weights(self, m):
|
118 |
+
if isinstance(m, (nn.Conv1d, nn.Linear)):
|
119 |
+
nn.init.trunc_normal_(m.weight, std=0.02)
|
120 |
+
nn.init.constant_(m.bias, 0)
|
121 |
+
|
122 |
+
def forward(self, mag_nb, pha_nb):
|
123 |
+
x_mag = self.conv_pre_mag(mag_nb)
|
124 |
+
x_pha = self.conv_pre_pha(pha_nb)
|
125 |
+
x_mag = self.norm_pre_mag(x_mag.transpose(1, 2)).transpose(1, 2)
|
126 |
+
x_pha = self.norm_pre_pha(x_pha.transpose(1, 2)).transpose(1, 2)
|
127 |
+
|
128 |
+
for conv_block_mag, conv_block_pha in zip(self.convnext_mag, self.convnext_pha):
|
129 |
+
x_mag = x_mag + x_pha
|
130 |
+
x_pha = x_pha + x_mag
|
131 |
+
x_mag = conv_block_mag(x_mag, cond_embedding_id=None)
|
132 |
+
x_pha = conv_block_pha(x_pha, cond_embedding_id=None)
|
133 |
+
|
134 |
+
x_mag = self.norm_post_mag(x_mag.transpose(1, 2))
|
135 |
+
mag_wb = mag_nb + self.linear_post_mag(x_mag).transpose(1, 2)
|
136 |
+
|
137 |
+
x_pha = self.norm_post_pha(x_pha.transpose(1, 2))
|
138 |
+
x_pha_r = self.linear_post_pha_r(x_pha)
|
139 |
+
x_pha_i = self.linear_post_pha_i(x_pha)
|
140 |
+
pha_wb = torch.atan2(x_pha_i, x_pha_r).transpose(1, 2)
|
141 |
+
|
142 |
+
com_wb = torch.stack((torch.exp(mag_wb) * torch.cos(pha_wb), torch.exp(mag_wb) * torch.sin(pha_wb)), dim=-1)
|
143 |
+
|
144 |
+
return mag_wb, pha_wb, com_wb
|
145 |
+
|
146 |
+
|
147 |
+
class DiscriminatorP(torch.nn.Module):
|
148 |
+
def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
|
149 |
+
super(DiscriminatorP, self).__init__()
|
150 |
+
self.period = period
|
151 |
+
norm_f = weight_norm if use_spectral_norm == False else spectral_norm
|
152 |
+
self.convs = nn.ModuleList(
|
153 |
+
[
|
154 |
+
norm_f(nn.Conv2d(1, 32, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))),
|
155 |
+
norm_f(nn.Conv2d(32, 128, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))),
|
156 |
+
norm_f(nn.Conv2d(128, 512, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))),
|
157 |
+
norm_f(nn.Conv2d(512, 1024, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))),
|
158 |
+
norm_f(nn.Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(2, 0))),
|
159 |
+
]
|
160 |
+
)
|
161 |
+
self.conv_post = norm_f(nn.Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))
|
162 |
+
|
163 |
+
def forward(self, x):
|
164 |
+
fmap = []
|
165 |
+
|
166 |
+
# 1d to 2d
|
167 |
+
b, c, t = x.shape
|
168 |
+
if t % self.period != 0: # pad first
|
169 |
+
n_pad = self.period - (t % self.period)
|
170 |
+
x = F.pad(x, (0, n_pad), "reflect")
|
171 |
+
t = t + n_pad
|
172 |
+
x = x.view(b, c, t // self.period, self.period)
|
173 |
+
|
174 |
+
for i, l in enumerate(self.convs):
|
175 |
+
x = l(x)
|
176 |
+
x = F.leaky_relu(x, LRELU_SLOPE)
|
177 |
+
if i > 0:
|
178 |
+
fmap.append(x)
|
179 |
+
x = self.conv_post(x)
|
180 |
+
fmap.append(x)
|
181 |
+
x = torch.flatten(x, 1, -1)
|
182 |
+
|
183 |
+
return x, fmap
|
184 |
+
|
185 |
+
|
186 |
+
class MultiPeriodDiscriminator(torch.nn.Module):
|
187 |
+
def __init__(self):
|
188 |
+
super(MultiPeriodDiscriminator, self).__init__()
|
189 |
+
self.discriminators = nn.ModuleList(
|
190 |
+
[
|
191 |
+
DiscriminatorP(2),
|
192 |
+
DiscriminatorP(3),
|
193 |
+
DiscriminatorP(5),
|
194 |
+
DiscriminatorP(7),
|
195 |
+
DiscriminatorP(11),
|
196 |
+
]
|
197 |
+
)
|
198 |
+
|
199 |
+
def forward(self, y, y_hat):
|
200 |
+
y_d_rs = []
|
201 |
+
y_d_gs = []
|
202 |
+
fmap_rs = []
|
203 |
+
fmap_gs = []
|
204 |
+
for i, d in enumerate(self.discriminators):
|
205 |
+
y_d_r, fmap_r = d(y)
|
206 |
+
y_d_g, fmap_g = d(y_hat)
|
207 |
+
y_d_rs.append(y_d_r)
|
208 |
+
fmap_rs.append(fmap_r)
|
209 |
+
y_d_gs.append(y_d_g)
|
210 |
+
fmap_gs.append(fmap_g)
|
211 |
+
|
212 |
+
return y_d_rs, y_d_gs, fmap_rs, fmap_gs
|
213 |
+
|
214 |
+
|
215 |
+
class MultiResolutionAmplitudeDiscriminator(nn.Module):
|
216 |
+
def __init__(
|
217 |
+
self,
|
218 |
+
resolutions: Tuple[Tuple[int, int, int]] = ((512, 128, 512), (1024, 256, 1024), (2048, 512, 2048)),
|
219 |
+
num_embeddings: int = None,
|
220 |
+
):
|
221 |
+
super().__init__()
|
222 |
+
self.discriminators = nn.ModuleList(
|
223 |
+
[DiscriminatorAR(resolution=r, num_embeddings=num_embeddings) for r in resolutions]
|
224 |
+
)
|
225 |
+
|
226 |
+
def forward(
|
227 |
+
self, y: torch.Tensor, y_hat: torch.Tensor, bandwidth_id: torch.Tensor = None
|
228 |
+
) -> Tuple[List[torch.Tensor], List[torch.Tensor], List[List[torch.Tensor]], List[List[torch.Tensor]]]:
|
229 |
+
y_d_rs = []
|
230 |
+
y_d_gs = []
|
231 |
+
fmap_rs = []
|
232 |
+
fmap_gs = []
|
233 |
+
|
234 |
+
for d in self.discriminators:
|
235 |
+
y_d_r, fmap_r = d(x=y, cond_embedding_id=bandwidth_id)
|
236 |
+
y_d_g, fmap_g = d(x=y_hat, cond_embedding_id=bandwidth_id)
|
237 |
+
y_d_rs.append(y_d_r)
|
238 |
+
fmap_rs.append(fmap_r)
|
239 |
+
y_d_gs.append(y_d_g)
|
240 |
+
fmap_gs.append(fmap_g)
|
241 |
+
|
242 |
+
return y_d_rs, y_d_gs, fmap_rs, fmap_gs
|
243 |
+
|
244 |
+
|
245 |
+
class DiscriminatorAR(nn.Module):
|
246 |
+
def __init__(
|
247 |
+
self,
|
248 |
+
resolution: Tuple[int, int, int],
|
249 |
+
channels: int = 64,
|
250 |
+
in_channels: int = 1,
|
251 |
+
num_embeddings: int = None,
|
252 |
+
):
|
253 |
+
super().__init__()
|
254 |
+
self.resolution = resolution
|
255 |
+
self.in_channels = in_channels
|
256 |
+
self.convs = nn.ModuleList(
|
257 |
+
[
|
258 |
+
weight_norm(nn.Conv2d(in_channels, channels, kernel_size=(7, 5), stride=(2, 2), padding=(3, 2))),
|
259 |
+
weight_norm(nn.Conv2d(channels, channels, kernel_size=(5, 3), stride=(2, 1), padding=(2, 1))),
|
260 |
+
weight_norm(nn.Conv2d(channels, channels, kernel_size=(5, 3), stride=(2, 2), padding=(2, 1))),
|
261 |
+
weight_norm(nn.Conv2d(channels, channels, kernel_size=3, stride=(2, 1), padding=1)),
|
262 |
+
weight_norm(nn.Conv2d(channels, channels, kernel_size=3, stride=(2, 2), padding=1)),
|
263 |
+
]
|
264 |
+
)
|
265 |
+
if num_embeddings is not None:
|
266 |
+
self.emb = torch.nn.Embedding(num_embeddings=num_embeddings, embedding_dim=channels)
|
267 |
+
torch.nn.init.zeros_(self.emb.weight)
|
268 |
+
self.conv_post = weight_norm(nn.Conv2d(channels, 1, (3, 3), padding=(1, 1)))
|
269 |
+
|
270 |
+
def forward(
|
271 |
+
self, x: torch.Tensor, cond_embedding_id: torch.Tensor = None
|
272 |
+
) -> Tuple[torch.Tensor, List[torch.Tensor]]:
|
273 |
+
fmap = []
|
274 |
+
x = x.squeeze(1)
|
275 |
+
|
276 |
+
x = self.spectrogram(x)
|
277 |
+
x = x.unsqueeze(1)
|
278 |
+
for l in self.convs:
|
279 |
+
x = l(x)
|
280 |
+
x = F.leaky_relu(x, LRELU_SLOPE)
|
281 |
+
fmap.append(x)
|
282 |
+
if cond_embedding_id is not None:
|
283 |
+
emb = self.emb(cond_embedding_id)
|
284 |
+
h = (emb.view(1, -1, 1, 1) * x).sum(dim=1, keepdims=True)
|
285 |
+
else:
|
286 |
+
h = 0
|
287 |
+
x = self.conv_post(x)
|
288 |
+
fmap.append(x)
|
289 |
+
x += h
|
290 |
+
x = torch.flatten(x, 1, -1)
|
291 |
+
|
292 |
+
return x, fmap
|
293 |
+
|
294 |
+
def spectrogram(self, x: torch.Tensor) -> torch.Tensor:
|
295 |
+
n_fft, hop_length, win_length = self.resolution
|
296 |
+
amplitude_spectrogram = torch.stft(
|
297 |
+
x,
|
298 |
+
n_fft=n_fft,
|
299 |
+
hop_length=hop_length,
|
300 |
+
win_length=win_length,
|
301 |
+
window=None, # interestingly rectangular window kind of works here
|
302 |
+
center=True,
|
303 |
+
return_complex=True,
|
304 |
+
).abs()
|
305 |
+
|
306 |
+
return amplitude_spectrogram
|
307 |
+
|
308 |
+
|
309 |
+
class MultiResolutionPhaseDiscriminator(nn.Module):
|
310 |
+
def __init__(
|
311 |
+
self,
|
312 |
+
resolutions: Tuple[Tuple[int, int, int]] = ((512, 128, 512), (1024, 256, 1024), (2048, 512, 2048)),
|
313 |
+
num_embeddings: int = None,
|
314 |
+
):
|
315 |
+
super().__init__()
|
316 |
+
self.discriminators = nn.ModuleList(
|
317 |
+
[DiscriminatorPR(resolution=r, num_embeddings=num_embeddings) for r in resolutions]
|
318 |
+
)
|
319 |
+
|
320 |
+
def forward(
|
321 |
+
self, y: torch.Tensor, y_hat: torch.Tensor, bandwidth_id: torch.Tensor = None
|
322 |
+
) -> Tuple[List[torch.Tensor], List[torch.Tensor], List[List[torch.Tensor]], List[List[torch.Tensor]]]:
|
323 |
+
y_d_rs = []
|
324 |
+
y_d_gs = []
|
325 |
+
fmap_rs = []
|
326 |
+
fmap_gs = []
|
327 |
+
|
328 |
+
for d in self.discriminators:
|
329 |
+
y_d_r, fmap_r = d(x=y, cond_embedding_id=bandwidth_id)
|
330 |
+
y_d_g, fmap_g = d(x=y_hat, cond_embedding_id=bandwidth_id)
|
331 |
+
y_d_rs.append(y_d_r)
|
332 |
+
fmap_rs.append(fmap_r)
|
333 |
+
y_d_gs.append(y_d_g)
|
334 |
+
fmap_gs.append(fmap_g)
|
335 |
+
|
336 |
+
return y_d_rs, y_d_gs, fmap_rs, fmap_gs
|
337 |
+
|
338 |
+
|
339 |
+
class DiscriminatorPR(nn.Module):
|
340 |
+
def __init__(
|
341 |
+
self,
|
342 |
+
resolution: Tuple[int, int, int],
|
343 |
+
channels: int = 64,
|
344 |
+
in_channels: int = 1,
|
345 |
+
num_embeddings: int = None,
|
346 |
+
):
|
347 |
+
super().__init__()
|
348 |
+
self.resolution = resolution
|
349 |
+
self.in_channels = in_channels
|
350 |
+
self.convs = nn.ModuleList(
|
351 |
+
[
|
352 |
+
weight_norm(nn.Conv2d(in_channels, channels, kernel_size=(7, 5), stride=(2, 2), padding=(3, 2))),
|
353 |
+
weight_norm(nn.Conv2d(channels, channels, kernel_size=(5, 3), stride=(2, 1), padding=(2, 1))),
|
354 |
+
weight_norm(nn.Conv2d(channels, channels, kernel_size=(5, 3), stride=(2, 2), padding=(2, 1))),
|
355 |
+
weight_norm(nn.Conv2d(channels, channels, kernel_size=3, stride=(2, 1), padding=1)),
|
356 |
+
weight_norm(nn.Conv2d(channels, channels, kernel_size=3, stride=(2, 2), padding=1)),
|
357 |
+
]
|
358 |
+
)
|
359 |
+
if num_embeddings is not None:
|
360 |
+
self.emb = torch.nn.Embedding(num_embeddings=num_embeddings, embedding_dim=channels)
|
361 |
+
torch.nn.init.zeros_(self.emb.weight)
|
362 |
+
self.conv_post = weight_norm(nn.Conv2d(channels, 1, (3, 3), padding=(1, 1)))
|
363 |
+
|
364 |
+
def forward(
|
365 |
+
self, x: torch.Tensor, cond_embedding_id: torch.Tensor = None
|
366 |
+
) -> Tuple[torch.Tensor, List[torch.Tensor]]:
|
367 |
+
fmap = []
|
368 |
+
x = x.squeeze(1)
|
369 |
+
|
370 |
+
x = self.spectrogram(x)
|
371 |
+
x = x.unsqueeze(1)
|
372 |
+
for l in self.convs:
|
373 |
+
x = l(x)
|
374 |
+
x = F.leaky_relu(x, LRELU_SLOPE)
|
375 |
+
fmap.append(x)
|
376 |
+
if cond_embedding_id is not None:
|
377 |
+
emb = self.emb(cond_embedding_id)
|
378 |
+
h = (emb.view(1, -1, 1, 1) * x).sum(dim=1, keepdims=True)
|
379 |
+
else:
|
380 |
+
h = 0
|
381 |
+
x = self.conv_post(x)
|
382 |
+
fmap.append(x)
|
383 |
+
x += h
|
384 |
+
x = torch.flatten(x, 1, -1)
|
385 |
+
|
386 |
+
return x, fmap
|
387 |
+
|
388 |
+
def spectrogram(self, x: torch.Tensor) -> torch.Tensor:
|
389 |
+
n_fft, hop_length, win_length = self.resolution
|
390 |
+
phase_spectrogram = torch.stft(
|
391 |
+
x,
|
392 |
+
n_fft=n_fft,
|
393 |
+
hop_length=hop_length,
|
394 |
+
win_length=win_length,
|
395 |
+
window=None, # interestingly rectangular window kind of works here
|
396 |
+
center=True,
|
397 |
+
return_complex=True,
|
398 |
+
).angle()
|
399 |
+
|
400 |
+
return phase_spectrogram
|
401 |
+
|
402 |
+
|
403 |
+
def feature_loss(fmap_r, fmap_g):
|
404 |
+
loss = 0
|
405 |
+
for dr, dg in zip(fmap_r, fmap_g):
|
406 |
+
for rl, gl in zip(dr, dg):
|
407 |
+
loss += torch.mean(torch.abs(rl - gl))
|
408 |
+
|
409 |
+
return loss
|
410 |
+
|
411 |
+
|
412 |
+
def discriminator_loss(disc_real_outputs, disc_generated_outputs):
|
413 |
+
loss = 0
|
414 |
+
r_losses = []
|
415 |
+
g_losses = []
|
416 |
+
for dr, dg in zip(disc_real_outputs, disc_generated_outputs):
|
417 |
+
r_loss = torch.mean(torch.clamp(1 - dr, min=0))
|
418 |
+
g_loss = torch.mean(torch.clamp(1 + dg, min=0))
|
419 |
+
loss += r_loss + g_loss
|
420 |
+
r_losses.append(r_loss.item())
|
421 |
+
g_losses.append(g_loss.item())
|
422 |
+
|
423 |
+
return loss, r_losses, g_losses
|
424 |
+
|
425 |
+
|
426 |
+
def generator_loss(disc_outputs):
|
427 |
+
loss = 0
|
428 |
+
gen_losses = []
|
429 |
+
for dg in disc_outputs:
|
430 |
+
l = torch.mean(torch.clamp(1 - dg, min=0))
|
431 |
+
gen_losses.append(l)
|
432 |
+
loss += l
|
433 |
+
|
434 |
+
return loss, gen_losses
|
435 |
+
|
436 |
+
|
437 |
+
def phase_losses(phase_r, phase_g):
|
438 |
+
ip_loss = torch.mean(anti_wrapping_function(phase_r - phase_g))
|
439 |
+
gd_loss = torch.mean(anti_wrapping_function(torch.diff(phase_r, dim=1) - torch.diff(phase_g, dim=1)))
|
440 |
+
iaf_loss = torch.mean(anti_wrapping_function(torch.diff(phase_r, dim=2) - torch.diff(phase_g, dim=2)))
|
441 |
+
|
442 |
+
return ip_loss, gd_loss, iaf_loss
|
443 |
+
|
444 |
+
|
445 |
+
def anti_wrapping_function(x):
|
446 |
+
return torch.abs(x - torch.round(x / (2 * np.pi)) * 2 * np.pi)
|
447 |
+
|
448 |
+
|
449 |
+
def stft_mag(audio, n_fft=2048, hop_length=512):
|
450 |
+
hann_window = torch.hann_window(n_fft).to(audio.device)
|
451 |
+
stft_spec = torch.stft(audio, n_fft, hop_length, window=hann_window, return_complex=True)
|
452 |
+
stft_mag = torch.abs(stft_spec)
|
453 |
+
return stft_mag
|
454 |
+
|
455 |
+
|
456 |
+
def cal_snr(pred, target):
|
457 |
+
snr = (20 * torch.log10(torch.norm(target, dim=-1) / torch.norm(pred - target, dim=-1).clamp(min=1e-8))).mean()
|
458 |
+
return snr
|
459 |
+
|
460 |
+
|
461 |
+
def cal_lsd(pred, target):
|
462 |
+
sp = torch.log10(stft_mag(pred).square().clamp(1e-8))
|
463 |
+
st = torch.log10(stft_mag(target).square().clamp(1e-8))
|
464 |
+
return (sp - st).square().mean(dim=1).sqrt().mean()
|
tools/__init__.py
ADDED
File without changes
|
tools/asr/config.py
ADDED
@@ -0,0 +1,36 @@
import os


def check_fw_local_models():
    """
    Check at startup whether Faster Whisper models are available locally.
    """
    model_size_list = [
        "tiny",
        "tiny.en",
        "base",
        "base.en",
        "small",
        "small.en",
        "medium",
        "medium.en",
        "large",
        "large-v1",
        "large-v2",
        "large-v3",
    ]
    for i, size in enumerate(model_size_list):
        if os.path.exists(f"tools/asr/models/faster-whisper-{size}"):
            model_size_list[i] = size + "-local"
    return model_size_list


asr_dict = {
    "达摩 ASR (中文)": {"lang": ["zh", "yue"], "size": ["large"], "path": "funasr_asr.py", "precision": ["float32"]},
    "Faster Whisper (多语种)": {
        "lang": ["auto", "zh", "en", "ja", "ko", "yue"],
        "size": check_fw_local_models(),
        "path": "fasterwhisper_asr.py",
        "precision": ["float32", "float16", "int8"],
    },
}
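The `asr_dict` above is what the WebUI consults to populate its ASR tool options. As a rough, editor-added illustration (not code from the repository), it can also be used to assemble the matching command line:

```python
# Illustrative only: build an ASR command line from asr_dict (argument values are examples).
from tools.asr.config import asr_dict

tool = asr_dict["Faster Whisper (多语种)"]
cmd = [
    "python", f"tools/asr/{tool['path']}",
    "-i", "output/slicer_opt",
    "-o", "output/asr_opt",
    "-s", tool["size"][-1],        # e.g. "large-v3", or "large-v3-local" if downloaded
    "-l", tool["lang"][0],         # "auto"
    "-p", tool["precision"][1],    # "float16"
]
print(" ".join(cmd))
```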
tools/asr/fasterwhisper_asr.py
ADDED
@@ -0,0 +1,129 @@
1 |
+
import argparse
|
2 |
+
import os
|
3 |
+
import traceback
|
4 |
+
|
5 |
+
os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"
|
6 |
+
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
|
7 |
+
|
8 |
+
import torch
|
9 |
+
from faster_whisper import WhisperModel
|
10 |
+
from tqdm import tqdm
|
11 |
+
|
12 |
+
from tools.asr.config import check_fw_local_models
|
13 |
+
|
14 |
+
# fmt: off
|
15 |
+
language_code_list = [
|
16 |
+
"af", "am", "ar", "as", "az",
|
17 |
+
"ba", "be", "bg", "bn", "bo",
|
18 |
+
"br", "bs", "ca", "cs", "cy",
|
19 |
+
"da", "de", "el", "en", "es",
|
20 |
+
"et", "eu", "fa", "fi", "fo",
|
21 |
+
"fr", "gl", "gu", "ha", "haw",
|
22 |
+
"he", "hi", "hr", "ht", "hu",
|
23 |
+
"hy", "id", "is", "it", "ja",
|
24 |
+
"jw", "ka", "kk", "km", "kn",
|
25 |
+
"ko", "la", "lb", "ln", "lo",
|
26 |
+
"lt", "lv", "mg", "mi", "mk",
|
27 |
+
"ml", "mn", "mr", "ms", "mt",
|
28 |
+
"my", "ne", "nl", "nn", "no",
|
29 |
+
"oc", "pa", "pl", "ps", "pt",
|
30 |
+
"ro", "ru", "sa", "sd", "si",
|
31 |
+
"sk", "sl", "sn", "so", "sq",
|
32 |
+
"sr", "su", "sv", "sw", "ta",
|
33 |
+
"te", "tg", "th", "tk", "tl",
|
34 |
+
"tr", "tt", "uk", "ur", "uz",
|
35 |
+
"vi", "yi", "yo", "zh", "yue",
|
36 |
+
"auto"]
|
37 |
+
# fmt: on
|
38 |
+
|
39 |
+
|
40 |
+
def execute_asr(input_folder, output_folder, model_size, language, precision):
|
41 |
+
if "-local" in model_size:
|
42 |
+
model_size = model_size[:-6]
|
43 |
+
model_path = f"tools/asr/models/faster-whisper-{model_size}"
|
44 |
+
else:
|
45 |
+
model_path = model_size
|
46 |
+
if language == "auto":
|
47 |
+
language = None # 不设置语种由模型自动输出概率最高的语种
|
48 |
+
print("loading faster whisper model:", model_size, model_path)
|
49 |
+
device = "cuda" if torch.cuda.is_available() else "cpu"
|
50 |
+
try:
|
51 |
+
model = WhisperModel(model_path, device=device, compute_type=precision)
|
52 |
+
except:
|
53 |
+
return print(traceback.format_exc())
|
54 |
+
|
55 |
+
input_file_names = os.listdir(input_folder)
|
56 |
+
input_file_names.sort()
|
57 |
+
|
58 |
+
output = []
|
59 |
+
output_file_name = os.path.basename(input_folder)
|
60 |
+
|
61 |
+
for file_name in tqdm(input_file_names):
|
62 |
+
try:
|
63 |
+
file_path = os.path.join(input_folder, file_name)
|
64 |
+
segments, info = model.transcribe(
|
65 |
+
audio=file_path,
|
66 |
+
beam_size=5,
|
67 |
+
vad_filter=True,
|
68 |
+
vad_parameters=dict(min_silence_duration_ms=700),
|
69 |
+
language=language,
|
70 |
+
)
|
71 |
+
text = ""
|
72 |
+
|
73 |
+
if info.language == "zh":
|
74 |
+
print("检测为中文文本, 转 FunASR 处理")
|
75 |
+
if "only_asr" not in globals():
|
76 |
+
from tools.asr.funasr_asr import only_asr # 如果用英文就不需要导入下载模型
|
77 |
+
text = only_asr(file_path, language=info.language.lower())
|
78 |
+
|
79 |
+
if text == "":
|
80 |
+
for segment in segments:
|
81 |
+
text += segment.text
|
82 |
+
output.append(f"{file_path}|{output_file_name}|{info.language.upper()}|{text}")
|
83 |
+
except:
|
84 |
+
print(traceback.format_exc())
|
85 |
+
|
86 |
+
output_folder = output_folder or "output/asr_opt"
|
87 |
+
os.makedirs(output_folder, exist_ok=True)
|
88 |
+
output_file_path = os.path.abspath(f"{output_folder}/{output_file_name}.list")
|
89 |
+
|
90 |
+
with open(output_file_path, "w", encoding="utf-8") as f:
|
91 |
+
f.write("\n".join(output))
|
92 |
+
print(f"ASR 任务完成->标注文件路径: {output_file_path}\n")
|
93 |
+
return output_file_path
|
94 |
+
|
95 |
+
|
96 |
+
if __name__ == "__main__":
|
97 |
+
parser = argparse.ArgumentParser()
|
98 |
+
parser.add_argument(
|
99 |
+
"-i", "--input_folder", type=str, required=True, help="Path to the folder containing WAV files."
|
100 |
+
)
|
101 |
+
parser.add_argument("-o", "--output_folder", type=str, required=True, help="Output folder to store transcriptions.")
|
102 |
+
parser.add_argument(
|
103 |
+
"-s",
|
104 |
+
"--model_size",
|
105 |
+
type=str,
|
106 |
+
default="large-v3",
|
107 |
+
choices=check_fw_local_models(),
|
108 |
+
help="Model Size of Faster Whisper",
|
109 |
+
)
|
110 |
+
parser.add_argument(
|
111 |
+
"-l", "--language", type=str, default="ja", choices=language_code_list, help="Language of the audio files."
|
112 |
+
)
|
113 |
+
parser.add_argument(
|
114 |
+
"-p",
|
115 |
+
"--precision",
|
116 |
+
type=str,
|
117 |
+
default="float16",
|
118 |
+
choices=["float16", "float32", "int8"],
|
119 |
+
help="fp16, int8 or fp32",
|
120 |
+
)
|
121 |
+
|
122 |
+
cmd = parser.parse_args()
|
123 |
+
output_file_path = execute_asr(
|
124 |
+
input_folder=cmd.input_folder,
|
125 |
+
output_folder=cmd.output_folder,
|
126 |
+
model_size=cmd.model_size,
|
127 |
+
language=cmd.language,
|
128 |
+
precision=cmd.precision,
|
129 |
+
)
|
tools/asr/funasr_asr.py
ADDED
@@ -0,0 +1,118 @@
1 |
+
# -*- coding:utf-8 -*-
|
2 |
+
|
3 |
+
import argparse
|
4 |
+
import os
|
5 |
+
import traceback
|
6 |
+
|
7 |
+
# from funasr.utils import version_checker
|
8 |
+
# version_checker.check_for_update = lambda: None
|
9 |
+
from funasr import AutoModel
|
10 |
+
from tqdm import tqdm
|
11 |
+
|
12 |
+
funasr_models = {} # 存储模型避免重复加载
|
13 |
+
|
14 |
+
|
15 |
+
def only_asr(input_file, language):
|
16 |
+
try:
|
17 |
+
model = create_model(language)
|
18 |
+
text = model.generate(input=input_file)[0]["text"]
|
19 |
+
except:
|
20 |
+
text = ""
|
21 |
+
print(traceback.format_exc())
|
22 |
+
return text
|
23 |
+
|
24 |
+
|
25 |
+
def create_model(language="zh"):
|
26 |
+
path_vad = "tools/asr/models/speech_fsmn_vad_zh-cn-16k-common-pytorch"
|
27 |
+
path_punc = "tools/asr/models/punc_ct-transformer_zh-cn-common-vocab272727-pytorch"
|
28 |
+
path_vad = path_vad if os.path.exists(path_vad) else "iic/speech_fsmn_vad_zh-cn-16k-common-pytorch"
|
29 |
+
path_punc = path_punc if os.path.exists(path_punc) else "iic/punc_ct-transformer_zh-cn-common-vocab272727-pytorch"
|
30 |
+
vad_model_revision = punc_model_revision = "v2.0.4"
|
31 |
+
|
32 |
+
if language == "zh":
|
33 |
+
path_asr = "tools/asr/models/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
|
34 |
+
path_asr = (
|
35 |
+
path_asr
|
36 |
+
if os.path.exists(path_asr)
|
37 |
+
else "iic/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
|
38 |
+
)
|
39 |
+
model_revision = "v2.0.4"
|
40 |
+
elif language == "yue":
|
41 |
+
path_asr = "tools/asr/models/speech_UniASR_asr_2pass-cantonese-CHS-16k-common-vocab1468-tensorflow1-online"
|
42 |
+
path_asr = (
|
43 |
+
path_asr
|
44 |
+
if os.path.exists(path_asr)
|
45 |
+
else "iic/speech_UniASR_asr_2pass-cantonese-CHS-16k-common-vocab1468-tensorflow1-online"
|
46 |
+
)
|
47 |
+
model_revision = "master"
|
48 |
+
path_vad = path_punc = None
|
49 |
+
vad_model_revision = punc_model_revision = None
|
50 |
+
###友情提示:粤语带VAD识别可能会有少量shape不对报错的,但是不带VAD可以.不带vad只能分阶段单独加标点。不过标点模型对粤语效果真的不行…
|
51 |
+
else:
|
52 |
+
raise ValueError("FunASR 不支持该语言" + ": " + language)
|
53 |
+
|
54 |
+
if language in funasr_models:
|
55 |
+
return funasr_models[language]
|
56 |
+
else:
|
57 |
+
model = AutoModel(
|
58 |
+
model=path_asr,
|
59 |
+
model_revision=model_revision,
|
60 |
+
vad_model=path_vad,
|
61 |
+
vad_model_revision=vad_model_revision,
|
62 |
+
punc_model=path_punc,
|
63 |
+
punc_model_revision=punc_model_revision,
|
64 |
+
)
|
65 |
+
print(f"FunASR 模型加载完成: {language.upper()}")
|
66 |
+
|
67 |
+
funasr_models[language] = model
|
68 |
+
return model
|
69 |
+
|
70 |
+
|
71 |
+
def execute_asr(input_folder, output_folder, model_size, language):
|
72 |
+
input_file_names = os.listdir(input_folder)
|
73 |
+
input_file_names.sort()
|
74 |
+
|
75 |
+
output = []
|
76 |
+
output_file_name = os.path.basename(input_folder)
|
77 |
+
|
78 |
+
model = create_model(language)
|
79 |
+
|
80 |
+
for file_name in tqdm(input_file_names):
|
81 |
+
try:
|
82 |
+
print("\n" + file_name)
|
83 |
+
file_path = os.path.join(input_folder, file_name)
|
84 |
+
text = model.generate(input=file_path)[0]["text"]
|
85 |
+
output.append(f"{file_path}|{output_file_name}|{language.upper()}|{text}")
|
86 |
+
except:
|
87 |
+
print(traceback.format_exc())
|
88 |
+
|
89 |
+
output_folder = output_folder or "output/asr_opt"
|
90 |
+
os.makedirs(output_folder, exist_ok=True)
|
91 |
+
output_file_path = os.path.abspath(f"{output_folder}/{output_file_name}.list")
|
92 |
+
|
93 |
+
with open(output_file_path, "w", encoding="utf-8") as f:
|
94 |
+
f.write("\n".join(output))
|
95 |
+
print(f"ASR 任务完成->标注文件路径: {output_file_path}\n")
|
96 |
+
return output_file_path
|
97 |
+
|
98 |
+
|
99 |
+
if __name__ == "__main__":
|
100 |
+
parser = argparse.ArgumentParser()
|
101 |
+
parser.add_argument(
|
102 |
+
"-i", "--input_folder", type=str, required=True, help="Path to the folder containing WAV files."
|
103 |
+
)
|
104 |
+
parser.add_argument("-o", "--output_folder", type=str, required=True, help="Output folder to store transcriptions.")
|
105 |
+
parser.add_argument("-s", "--model_size", type=str, default="large", help="Model Size of FunASR is Large")
|
106 |
+
parser.add_argument(
|
107 |
+
"-l", "--language", type=str, default="zh", choices=["zh", "yue", "auto"], help="Language of the audio files."
|
108 |
+
)
|
109 |
+
parser.add_argument(
|
110 |
+
"-p", "--precision", type=str, default="float16", choices=["float16", "float32"], help="fp16 or fp32"
|
111 |
+
) # 还没接入
|
112 |
+
cmd = parser.parse_args()
|
113 |
+
execute_asr(
|
114 |
+
input_folder=cmd.input_folder,
|
115 |
+
output_folder=cmd.output_folder,
|
116 |
+
model_size=cmd.model_size,
|
117 |
+
language=cmd.language,
|
118 |
+
)
|
tools/asr/models/.gitignore
ADDED
@@ -0,0 +1,2 @@
*
!.gitignore
tools/audio_sr.py
ADDED
@@ -0,0 +1,50 @@
+from __future__ import absolute_import, division, print_function, unicode_literals
+import sys
+import os
+
+AP_BWE_main_dir_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "AP_BWE_main")
+sys.path.append(AP_BWE_main_dir_path)
+import json
+import torch
+import torchaudio.functional as aF
+# from attrdict import AttrDict  ####will be bug in py3.10
+
+from datasets1.dataset import amp_pha_stft, amp_pha_istft
+from models.model import APNet_BWE_Model
+
+
+class AP_BWE:
+    def __init__(self, device, DictToAttrRecursive, checkpoint_file=None):
+        if checkpoint_file == None:
+            checkpoint_file = "%s/24kto48k/g_24kto48k.zip" % (AP_BWE_main_dir_path)
+            if os.path.exists(checkpoint_file) == False:
+                raise FileNotFoundError
+        config_file = os.path.join(os.path.split(checkpoint_file)[0], "config.json")
+        with open(config_file) as f:
+            data = f.read()
+        json_config = json.loads(data)
+        # h = AttrDict(json_config)
+        h = DictToAttrRecursive(json_config)
+        model = APNet_BWE_Model(h).to(device)
+        state_dict = torch.load(checkpoint_file, map_location="cpu", weights_only=False)
+        model.load_state_dict(state_dict["generator"])
+        model.eval()
+        self.device = device
+        self.model = model
+        self.h = h
+
+    def to(self, *arg, **kwargs):
+        self.model.to(*arg, **kwargs)
+        self.device = self.model.conv_pre_mag.weight.device
+        return self
+
+    def __call__(self, audio, orig_sampling_rate):
+        with torch.no_grad():
+            # audio, orig_sampling_rate = torchaudio.load(inp_path)
+            # audio = audio.to(self.device)
+            audio = aF.resample(audio, orig_freq=orig_sampling_rate, new_freq=self.h.hr_sampling_rate)
+            amp_nb, pha_nb, com_nb = amp_pha_stft(audio, self.h.n_fft, self.h.hop_size, self.h.win_size)
+            amp_wb_g, pha_wb_g, com_wb_g = self.model(amp_nb, pha_nb)
+            audio_hr_g = amp_pha_istft(amp_wb_g, pha_wb_g, self.h.n_fft, self.h.hop_size, self.h.win_size)
+            # sf.write(opt_path, audio_hr_g.squeeze().cpu().numpy(), self.h.hr_sampling_rate, 'PCM_16')
+            return audio_hr_g.squeeze().cpu().numpy(), self.h.hr_sampling_rate
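For reference, a minimal usage sketch of the AP_BWE wrapper above. The import location of DictToAttrRecursive and the input WAV file are assumptions for illustration; the default checkpoint upsamples 24 kHz audio to 48 kHz.

# Hypothetical usage of AP_BWE; the DictToAttrRecursive import path and the
# input WAV are assumptions for illustration.
import torch
import torchaudio

from GPT_SoVITS.TTS_infer_pack.TTS import DictToAttrRecursive  # assumed location
from tools.audio_sr import AP_BWE

device = "cuda" if torch.cuda.is_available() else "cpu"
sr_model = AP_BWE(device, DictToAttrRecursive)      # loads the default 24k->48k generator
audio, sr = torchaudio.load("sample_24k.wav")       # example input file
audio_48k, sr_48k = sr_model(audio.to(device), sr)  # numpy waveform and its new sample rate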
tools/cmd-denoise.py
ADDED
@@ -0,0 +1,38 @@
+import os
+import argparse
+import traceback
+
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+from tqdm import tqdm
+
+path_denoise = "tools/denoise-model/speech_frcrn_ans_cirm_16k"
+path_denoise = path_denoise if os.path.exists(path_denoise) else "damo/speech_frcrn_ans_cirm_16k"
+ans = pipeline(Tasks.acoustic_noise_suppression, model=path_denoise)
+
+
+def execute_denoise(input_folder, output_folder):
+    os.makedirs(output_folder, exist_ok=True)
+    # print(input_folder)
+    # print(list(os.listdir(input_folder).sort()))
+    for name in tqdm(os.listdir(input_folder)):
+        try:
+            ans("%s/%s" % (input_folder, name), output_path="%s/%s" % (output_folder, name))
+        except:
+            traceback.print_exc()
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "-i", "--input_folder", type=str, required=True, help="Path to the folder containing WAV files."
+    )
+    parser.add_argument("-o", "--output_folder", type=str, required=True, help="Output folder to store transcriptions.")
+    parser.add_argument(
+        "-p", "--precision", type=str, default="float16", choices=["float16", "float32"], help="fp16 or fp32"
+    )  # 还没接入
+    cmd = parser.parse_args()
+    execute_denoise(
+        input_folder=cmd.input_folder,
+        output_folder=cmd.output_folder,
+    )
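Because the file name contains a hyphen, cmd-denoise.py is meant to be run as a script rather than imported. A minimal sketch of an invocation with example paths; the --precision flag is parsed but, as the script's own comment notes, not yet wired in:

# Minimal sketch: run the denoiser over a folder of WAVs (example paths only).
import subprocess

subprocess.run(
    [
        "python", "tools/cmd-denoise.py",
        "-i", "output/slicer_opt",     # noisy input WAVs
        "-o", "output/denoise_opt",    # denoised copies keep their file names
    ],
    check=True,
)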
tools/denoise-model/.gitignore
ADDED
@@ -0,0 +1,2 @@
+*
+!.gitignore
tools/i18n/i18n.py
ADDED
@@ -0,0 +1,41 @@
+import json
+import locale
+import os
+
+I18N_JSON_DIR: os.PathLike = os.path.join(os.path.dirname(os.path.relpath(__file__)), "locale")
+
+
+def load_language_list(language):
+    with open(os.path.join(I18N_JSON_DIR, f"{language}.json"), "r", encoding="utf-8") as f:
+        language_list = json.load(f)
+    return language_list
+
+
+def scan_language_list():
+    language_list = []
+    for name in os.listdir(I18N_JSON_DIR):
+        if name.endswith(".json"):
+            language_list.append(name.split(".")[0])
+    return language_list
+
+
+class I18nAuto:
+    def __init__(self, language=None):
+        if language in ["Auto", None]:
+            language = locale.getdefaultlocale()[0]
+            # getlocale can't identify the system's language ((None, None))
+        if not os.path.exists(os.path.join(I18N_JSON_DIR, f"{language}.json")):
+            language = "en_US"
+        self.language = language
+        self.language_map = load_language_list(language)
+
+    def __call__(self, key):
+        return self.language_map.get(key, key)
+
+    def __repr__(self):
+        return "Use Language: " + self.language
+
+
+if __name__ == "__main__":
+    i18n = I18nAuto(language="en_US")
+    print(i18n)
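A minimal sketch of how the i18n helper is used; the lookup keys come from the locale JSON files added below, and unknown keys fall back to themselves.

# Minimal sketch of the translation helper defined above.
from tools.i18n.i18n import I18nAuto, scan_language_list

print(scan_language_list())          # e.g. ['en_US', 'es_ES', 'fr_FR', ...]
i18n = I18nAuto(language="en_US")    # "Auto" or None falls back to the system locale
print(i18n("中文"))                  # -> "Chinese" (key found in en_US.json)
print(i18n("no-such-key"))           # -> "no-such-key" (missing keys return themselves)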
tools/i18n/locale/en_US.json
ADDED
@@ -0,0 +1,211 @@
1 |
+
{
|
2 |
+
"(1)MDX-Net(onnx_dereverb):对于双通道混响是最好的选择,不能去除单通道混响;": "(1)MDX-Net(onnx_dereverb): Best choice for dual-channel reverberation, cannot remove single-channel reverberation;",
|
3 |
+
"(234)DeEcho:去除延迟效果。Aggressive比Normal去除得更彻底,DeReverb额外去除混响,可去除单声道混响,但是对高频重的板式混响去不干净。": "(234)DeEcho: Removes delay effects. Aggressive mode removes more thoroughly than Normal mode. DeReverb additionally removes reverberation, can remove mono reverberation, but does not clean heavily high-frequency plate reverberation.",
|
4 |
+
"*实验/模型名": "*Experiment/model name",
|
5 |
+
"*文本标注文件": "*Text labelling file",
|
6 |
+
"*训练集音频文件目录": "*Audio dataset folder",
|
7 |
+
"*请上传并填写参考信息": "*Please upload and fill reference information",
|
8 |
+
"*请填写需要合成的目标文本和语种模式": "*Please fill in the target text and language mode for synthesis",
|
9 |
+
".限制范围越小判别效果越好。": "Less Multilingual is better",
|
10 |
+
"1-GPT-SoVITS-TTS": "1-GPT-SOVITS-TTS",
|
11 |
+
"1、DeEcho-DeReverb模型的耗时是另外2个DeEcho模型的接近2倍;": "1. The DeEcho-DeReverb model's processing time is nearly twice that of the other two DeEcho models.",
|
12 |
+
"1、保留人声:不带和声的音频选这个,对主人声保留比HP5更好。内置HP2和HP3两个模型,HP3可能轻微漏伴奏但对主人声保留比HP2稍微好一丁点;": "1. Preserve Vocals: Choose this option for audio without harmonies, as it better retains the main vocal compared to the HP5 model. This option includes two built-in models, HP2 and HP3. HP3 may slightly let through some accompaniment but retains the main vocal slightly better than HP2.",
|
13 |
+
"2-GPT-SoVITS-变声": "2-GPT-SoVITS-Voice Changer",
|
14 |
+
"2、MDX-Net-Dereverb模型挺慢的;": "2、MDX-Net-Dereverb Model is slow;",
|
15 |
+
"2、仅保留主人声:带和声的音频选这个,对主人声可能有削弱。内置HP5一个模型;": "2. Keep Only Main Vocal: Choose this option for audio with harmonies, as it may slightly reduce the main vocal. Includes one built-in HP5 model;",
|
16 |
+
"3、个人推荐的最干净的配置是先MDX-Net再DeEcho-Aggressive。": "3. Personal Recommendation for the cleanest configuration: First use MDX-Net followed by DeEcho-Aggressive",
|
17 |
+
"3、去混响、去延迟模型(by FoxJoy):": "3. Reverberation and delay removal model(by FoxJoy):",
|
18 |
+
"ASR 模型": "ASR model",
|
19 |
+
"ASR 模型尺寸": "ASR model size",
|
20 |
+
"ASR 语言设置": "ASR language",
|
21 |
+
"GPT 训练: 模型权重文件在 GPT_weights/": "GPT Training: Model Weights saved in GPT_weights/",
|
22 |
+
"GPT模型列表": "GPT weight list",
|
23 |
+
"GPT训练": "GPT Training",
|
24 |
+
"GPT采样参数(无参考文本时不要太低。不懂就用默认):": "GPT sampling parameters (not too low when there's no reference text. Use default if unsure):",
|
25 |
+
"GPU卡号,只能填1个整数": "GPU number, can only input ONE integer",
|
26 |
+
"GPU卡号以-分割,每个卡号一个进程": "GPU number is separated by -, each GPU will run one process ",
|
27 |
+
"LoRA秩": "LoRA Rank",
|
28 |
+
"SoVITS V3 底模缺失,无法加载相应 LoRA 权重": "Missing Pretrained SoVITS V3 Model, Cannot Load LoRA Weights",
|
29 |
+
"SoVITS 训练: 模型权重文件在 SoVITS_weights/": "SoVITS Training: Model Weights saved in SoVITS_weights/",
|
30 |
+
"SoVITS模型列表": "SoVITS weight list",
|
31 |
+
"SoVITS训练": "SoVITS Training",
|
32 |
+
"TTS推理WebUI": "TTS Inference WebUI",
|
33 |
+
"UVR5人声伴奏分离&去混响去延迟工具": "UVR5 WebUI (Vocal Separation/Deecho/Dereverb)",
|
34 |
+
"alpha_mix:混多少比例归一化后音频进来": "alpha_mix: proportion of normalized audio merged into dataset",
|
35 |
+
"batch_size": "Batch Size",
|
36 |
+
"hop_size:怎么算音量曲线,越小精度越大计算量越高(不是精度越大效果越好)": "hop_size: FO hop size, the smaller the value, the higher the accuracy)",
|
37 |
+
"max:归一化后最大值多少": "Loudness multiplier after normalized",
|
38 |
+
"max_sil_kept:切完后静音最多留多长": "Maximum length for silence to be kept",
|
39 |
+
"min_interval:最短切割间隔": "Minumum interval for audio cutting",
|
40 |
+
"min_length:每段最小多长,如果第一段太短一直和后面段连起来直到超过这个值": "min_length: the minimum length of each segment. If the first segment is too short, it will be concatenated with the next segment until it exceeds this value",
|
41 |
+
"temperature": "temperature",
|
42 |
+
"threshold:音量小于这个值视作静音的备选切割点": "Noise gate threshold (loudness below this value will be treated as noise",
|
43 |
+
"top_k": "top_k",
|
44 |
+
"top_p": "top_p",
|
45 |
+
"v3暂不支持该模式,使用了会报错。": "v3 does not support this mode currently, using it will cause an error.",
|
46 |
+
"v3输出如果觉得闷可以试试开超分": "For V3 model, if generated audio sounds somewhat muffled, try enable audio super-resolution.",
|
47 |
+
"不切": "No slice",
|
48 |
+
"中文": "Chinese",
|
49 |
+
"中文教程文档": "Chinese Tutorial",
|
50 |
+
"中英混合": "Chinese-English Mixed",
|
51 |
+
"主参考音频(请上传3~10秒内参考音频,超过会报错!)": "Primary Reference Audio (Please upload reference audio within 3-10 seconds, exceeding this limit will cause an error!)",
|
52 |
+
"主参考音频的文本": "Text of Primary Reference Audio",
|
53 |
+
"主参考音频的语种": "Language of Primary Reference Audio",
|
54 |
+
"也可批量输入音频文件, 二选一, 优先读文件夹": "Multiple audio files can also be imported. If a folder path exists, this input is ignored.",
|
55 |
+
"人声伴奏分离批量处理, 使用UVR5模型。": "Batch processing for vocal and instrumental separation, using the UVR5 model.",
|
56 |
+
"人声分离WebUI": "Vocal Separation WebUI",
|
57 |
+
"人声提取激进程度": "Vocal extraction aggressiveness",
|
58 |
+
"以下文件或文件夹不存在": "No Such File or Folder",
|
59 |
+
"以下模型不存在:": "No Such Model:",
|
60 |
+
"伴奏人声分离&去混响&去回声": "Vocals/Accompaniment Separation & Reverberation Removal",
|
61 |
+
"你没有下载超分模型的参数,因此不进行超分。如想超分请先参照教程把文件下载好": "Super-Resolution Model Not Found. Please follow the tutorial to download the model file if you want to use it.",
|
62 |
+
"使用无参考文本模式时建议使用微调的GPT": "Recommended to use a Finetune-GPT when using Prompt-Free Mode.",
|
63 |
+
"保存频率save_every_epoch": "Save frequency (save_every_epoch):",
|
64 |
+
"保持随机": "Keep Random",
|
65 |
+
"关闭": "Close ",
|
66 |
+
"凑50字一切": "Slice per 50 characters",
|
67 |
+
"凑四句一切": "Slice once every 4 sentences",
|
68 |
+
"分桶处理模式已关闭": "Bucket Processing Mode Disabled",
|
69 |
+
"分桶处理模式已开启": "Bucket Processing Mode Enabled",
|
70 |
+
"分段返回模式不支持分桶处理,已自动关闭分桶处理": "Segmented Return Mode does not support Bucket Processing, Bucket Processing Disabled automatically",
|
71 |
+
"分段返回模式已开启": "Segmented Return Mode Enabled",
|
72 |
+
"分段间隔(秒)": "Segment Interval (Seconds)",
|
73 |
+
"分段间隔过小,已自动设置为0.01": "Segment Interval too short, automatically set to 0.01",
|
74 |
+
"切分": "Segmentation",
|
75 |
+
"切分后文本": "Inference Text After Segmentation",
|
76 |
+
"切分后的子音频的输出根目录": "Audio slicer output folder",
|
77 |
+
"切分文本": "Segment Text",
|
78 |
+
"切割使用的进程数": "CPU threads used for audio slicing",
|
79 |
+
"刷新模型路径": "refreshing model paths",
|
80 |
+
"前端处理后的文本(每句):": "Processed text from the frontend (per sentence):",
|
81 |
+
"前置数据集获取工具": "Fetch Datasets",
|
82 |
+
"占用中": " Occupying",
|
83 |
+
"去混响/去延迟,附:": "Dereverberation/Delay Removal, including:",
|
84 |
+
"参考音频在3~10秒范围外,请更换!": "Reference audio is outside the 3-10 second range, please choose another one!",
|
85 |
+
"参考音频的文本": "Text for reference audio",
|
86 |
+
"参考音频的语种": "Language for reference audio",
|
87 |
+
"句间停顿秒数": "Pause Duration between Sentences (Seconds)",
|
88 |
+
"可选项:通过拖拽多个文件上传多个参考音频(建议同性),平均融合他们的音色。如不填写此项,音色由左侧单个参考音频控制。如是微调模型,建议参考音频全部在微调训练集音色内,底模不用管。": "Optional: Upload multiple reference audio files by dragging and dropping them (recommended to be of the same gender), and average their tone. If this option is left blank, the tone will be controlled by the single reference audio on the left. If fine-tuning the model, it is recommended that all reference audio files have tones within the fine-tuning training set; the pretrained model can be ignored.",
|
89 |
+
"合成语音": "Start inference",
|
90 |
+
"合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。": "An example of a valid folder path format: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例 (simply copy the address from the file manager's address bar).",
|
91 |
+
"后续将支持转音素、手工修改音素、语音合成分步执行。": "Support for Phoneme Conversion, Manual Phoneme Editing, and Step-by-Step Speech Synthesis will be added in the future.",
|
92 |
+
"听不清参考音频说的啥(不晓得写啥)可以开。开启后无视填写的参考文本。": "If reference audio is not clear or unsure what to write, enable this option to ignore the reference text.",
|
93 |
+
"启用并行推理版本": "Enable Parallel Inference Version",
|
94 |
+
"填切割后音频所在目录!读取的音频文件完整路径=该目录-拼接-list文件里波形对应的文件名(不是全路径)。如果留空则使用.list文件里的绝对全路径。": "Please fill in the segmented audio files' directory! The full path of the audio file = the directory concatenated with the filename corresponding to the waveform in the list file (not the full path). If left blank, the absolute full path in the .list file will be used.",
|
95 |
+
"多语种混合": "Multilingual Mixed",
|
96 |
+
"多语种混合(粤语)": "Multilingual Mixed(Yue)",
|
97 |
+
"失败": " Failed",
|
98 |
+
"如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE.": "If you do not agree with this clause, you cannot use or reference any codes and files within the software package. See the root directory Agreement-LICENSE for details.",
|
99 |
+
"实际输入的参考文本:": "Actual Input Reference Text:",
|
100 |
+
"实际输入的目标文本(切句后):": "Actual Input Target Text (after sentence segmentation):",
|
101 |
+
"实际输入的目标文本(每句):": "Actual Input Target Text (per sentence):",
|
102 |
+
"实际输入的目标文本:": "Actual Input Target Text:",
|
103 |
+
"导出文件格式": "Export file format",
|
104 |
+
"已关闭": " is Closed",
|
105 |
+
"已完成": " Finished",
|
106 |
+
"已开启": " is Opened",
|
107 |
+
"并行推理": "Parallel Inference",
|
108 |
+
"并行推理模式已关闭": "Parallel Inference Mode Disabled",
|
109 |
+
"并行推理模式已开启": "Parallel Inference Mode Enabled",
|
110 |
+
"开启": "Open ",
|
111 |
+
"开启无参考文本模式。不填参考文本亦相当于开启。": "Enable no reference mode. If you don't fill 'Text for reference audio', no reference mode will be enabled.",
|
112 |
+
"微调训练": "Fine-Tuning",
|
113 |
+
"怎么切": "How to slice the sentence",
|
114 |
+
"总训练轮数total_epoch": "Total training epochs (total_epoch):",
|
115 |
+
"总训练轮数total_epoch,不建议太高": "Total epochs, do not increase to a value that is too high",
|
116 |
+
"指定输出主人声文件夹": "Specify the output folder for vocals:",
|
117 |
+
"指定输出非主人声文件夹": "Specify the output folder for accompaniment:",
|
118 |
+
"按中文句号。切": "Slice by Chinese punct",
|
119 |
+
"按标点符号切": "Slice by every punct",
|
120 |
+
"按英文句号.切": "Slice by English punct",
|
121 |
+
"推理": "Inference",
|
122 |
+
"推理设置": "Inference Settings",
|
123 |
+
"提取文本Bert特征": "Extract Text BERT Features",
|
124 |
+
"数据分桶(并行推理时会降低一点计算量)": "Data Bucketing (Reduces Computation Cost in Parallel Inference)",
|
125 |
+
"数据类型精度": "Computing precision",
|
126 |
+
"文本分词与特征提取": "Tokenization & BERT Feature Extraction",
|
127 |
+
"文本切分工具。太长的文本合成出来效果不一定好,所以太长建议先切。合成会根据文本的换行分开合成再拼起来。": "Text Segmentation Tool. Very long text may not yield good synthesis results, so Segmentation is Recommended. Synthesis will be performed based on line breaks and then concatenated.",
|
128 |
+
"文本模块学习率权重": "Text model learning rate weighting",
|
129 |
+
"施工中,请静候佳音": "In construction, please wait",
|
130 |
+
"日文": "Japanese",
|
131 |
+
"日英混合": "Japanese-English Mixed",
|
132 |
+
"是否仅保存最新的权重文件以节省硬盘空间": "Save only the latest weight file to save disk space",
|
133 |
+
"是否在每次保存时间点将最终小模型保存至weights文件夹": "Save a small final model to the 'weights' folder at each save point:",
|
134 |
+
"是否开启DPO训练选项(实验性)": "Enable DPO Training (Experimental)",
|
135 |
+
"是否直接对上次合成结果调整语速和音色。防止随机性。": "Adjust the speech rate and tone of the last synthesis result to prevent randomness.",
|
136 |
+
"显卡信息": "GPU Information",
|
137 |
+
"未下载模型": "Model Not Downloaded",
|
138 |
+
"本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.": "This software is open source under the MIT license. The author does not have any control over the software. Users who use the software and distribute the sounds exported by the software are solely responsible.",
|
139 |
+
"标注文件路径 (含文件后缀 *.list)": "Label File Path (with file extension *.list)",
|
140 |
+
"模型": "Model",
|
141 |
+
"模型分为三类:": "Models are categorized into three types:",
|
142 |
+
"模型切换": "Model switch",
|
143 |
+
"每张显卡的batch_size": "Batch size per GPU:",
|
144 |
+
"版本": "Version",
|
145 |
+
"粤英混合": "Yue-English Mixed",
|
146 |
+
"粤语": "Yue",
|
147 |
+
"终止合成": "Terminate Synthesis",
|
148 |
+
"缺少Hubert数据集": "Missing Hubert Dataset",
|
149 |
+
"缺少语义数据集": "Missing Semantics Dataset",
|
150 |
+
"缺少音素数据集": "Missing Phoneme Dataset",
|
151 |
+
"缺少音频数据集": "Missing Audio Dataset",
|
152 |
+
"英文": "English",
|
153 |
+
"训练集格式化一键三连": "Training Set One-Click Formatting",
|
154 |
+
"训练集格式化工具": "Dataset Formatting Tool",
|
155 |
+
"语义Token提取": "Semantics Token Extraction",
|
156 |
+
"语速": "Speech rate",
|
157 |
+
"语速调整,高为更快": "Adjust speech rate, higher for faster",
|
158 |
+
"语速调节不支持分桶处理,已自动关闭分桶处理": "Speech Rate Adjustment does not support Bucket Processing, Bucket Processing Disabled automatically",
|
159 |
+
"语音切分": "Speech Slicing",
|
160 |
+
"语音切分工具": "Speech Slicing Tool",
|
161 |
+
"语音文本校对标注工具": "Speech-to-Text Proofreading Tool",
|
162 |
+
"语音自监督特征提取": "Speech SSL Feature Extraction",
|
163 |
+
"语音识别": "Speech Recognition",
|
164 |
+
"语音识别工具": "Speech Recognition Tool",
|
165 |
+
"语音降噪": "Speech Denoising",
|
166 |
+
"语音降噪工具": "Speech Denoising Tool",
|
167 |
+
"请上传3~10秒内参考音频,超过会报错!": "Please upload a reference audio within the 3-10 second range; if it exceeds this duration, it will raise errors.",
|
168 |
+
"请上传参考音频": "Please Upload the Reference Audio",
|
169 |
+
"请填入推理文本": "Please Fill in the Terget Text",
|
170 |
+
"请填入正确的List路径": "Please Fill in the Correct List Path",
|
171 |
+
"请填入正确的音频文件夹路径": "Please Fill in the Correct Audio Folder Path",
|
172 |
+
"请输入有效文本": "Please enter valid text.",
|
173 |
+
"路径不存在,使用默认配置": "Paths Not Found, Using Default Config",
|
174 |
+
"路径不能为空": "Expected No Empty Path",
|
175 |
+
"路径错误": "Path Error",
|
176 |
+
"转换": "Convert",
|
177 |
+
"辅参考音频(可选多个,或不选)": "Secondary Reference Audio (Multiple Optional, or None)",
|
178 |
+
"输入待处理音频文件夹路径": "Enter the path of the audio folder to be processed:",
|
179 |
+
"输入文件夹路径": "Input folder path",
|
180 |
+
"输入路径不存在": "Input Path Not Found",
|
181 |
+
"输入路径存在但不可用": "Input Path Exists but Unavailable",
|
182 |
+
"输出logs/实验名目录下应有23456开头的文件和文件夹": "output folder (logs/{experiment name}) should have files and folders starts with 23456.",
|
183 |
+
"输出信息": "Output information",
|
184 |
+
"输出文件夹路径": "Output folder path",
|
185 |
+
"输出的语音": "Inference Result",
|
186 |
+
"运行中": " Running",
|
187 |
+
"进度": "Progress",
|
188 |
+
"进程已终止": " Process Terminated",
|
189 |
+
"进程输出信息": " Process Output Information",
|
190 |
+
"选择训练完存放在SoVITS_weights和GPT_weights下的模型。默认的一个是底模,体验5秒Zero Shot TTS用。": "Choose the models from SoVITS_weights and GPT_weights. The default one is a pretrain, so you can experience zero shot TTS.",
|
191 |
+
"采样步数,如果觉得电,提高试试,如果觉得慢,降低试试": "Sampling Steps: If feel noisy, try increasing, if feel slow, try decreasing",
|
192 |
+
"重复惩罚": "Repetition Penalty",
|
193 |
+
"随机种子": "Random Seed",
|
194 |
+
"需先终止才能开启下一次任务": "Please Terminate First to Start Next Task",
|
195 |
+
"需要合成的切分前文本": "Inference Text Before Segmentation",
|
196 |
+
"需要合成的文本": "Inference text",
|
197 |
+
"需要合成的文本的语种": "Language of the Inference Text",
|
198 |
+
"需要合成的语种": "Inference text language",
|
199 |
+
"韩文": "Korean",
|
200 |
+
"韩英混合": "Korean-English Mixed",
|
201 |
+
"音频加载失败": "Failed to Load Audio",
|
202 |
+
"音频文件不存在,跳过:": "Audio File Not Found, Skipping: ",
|
203 |
+
"音频标注WebUI": "Audio Labeling WebUI",
|
204 |
+
"音频自动切分输入路径,可文件可文件夹": "Audio slicer input (file or folder)",
|
205 |
+
"音频超分中": "Running Audio Super-Resolution",
|
206 |
+
"预训练GPT模型路径": "Pretrained GPT Model Path",
|
207 |
+
"预训练SSL模型路径": "Pretrained SSL Model Path",
|
208 |
+
"预训练SoVITS-D模型路径": "Pretrained SoVITS-D Model Path",
|
209 |
+
"预训练SoVITS-G模型路径": "Pretrained SoVITS-G Model Path",
|
210 |
+
"预训练中文BERT模型路径": "Pretrained Chinese BERT Model Path"
|
211 |
+
}
|
tools/i18n/locale/es_ES.json
ADDED
@@ -0,0 +1,211 @@
1 |
+
{
|
2 |
+
"(1)MDX-Net(onnx_dereverb):对于双通道混响是最好的选择,不能去除单通道混响;": "(1)MDX-Net (onnx_dereverb): reverberación estéreo, la mejor opción; no puede eliminar reverberación mono",
|
3 |
+
"(234)DeEcho:去除延迟效果。Aggressive比Normal去除得更彻底,DeReverb额外去除混响,可去除单声道混响,但是对高频重的板式混响去不干净。": "(234)DeEcho: Eliminar el efecto de retardo. Aggressive elimina más que Normal, DeReverb elimina reverberación adicional, puede eliminar reverberación mono, pero no limpia bien la reverberación de placa de alta frecuencia",
|
4 |
+
"*实验/模型名": "*Nombre del experimento/modelo",
|
5 |
+
"*文本标注文件": "*Archivo de etiquetado de texto",
|
6 |
+
"*训练集音频文件目录": "*Directorio de archivos de audio de entrenamiento",
|
7 |
+
"*请上传并填写参考信息": "*Por favor, suba y complete la información de referencia",
|
8 |
+
"*请填写需要合成的目标文本和语种模式": "*Por favor, complete el texto objetivo a sintetizar y el modo de idioma",
|
9 |
+
".限制范围越小判别效果越好。": ".Cuanto más pequeño sea el rango, mejor será el efecto de discriminación.",
|
10 |
+
"1-GPT-SoVITS-TTS": "1-GPT-SoVITS-TTS",
|
11 |
+
"1、DeEcho-DeReverb模型的耗时是另外2个DeEcho模型的接近2倍;": "1. El modelo DeEcho-DeReverb tarda casi el doble que los otros dos modelos DeEcho",
|
12 |
+
"1、保留人声:不带和声的音频选这个,对主人声保留比HP5更好。内置HP2和HP3两个模型,HP3可能轻微漏伴奏但对主人声保留比HP2稍微好一丁点;": "1. Retener voz principal: seleccione este para audio sin coros, retiene mejor la voz principal que HP5. Incluye dos modelos, HP2 y HP3; HP3 puede filtrar ligeramente el acompañamiento pero retiene mejor la voz principal que HP2",
|
13 |
+
"2-GPT-SoVITS-变声": "2-GPT-SoVITS-Cambio de voz",
|
14 |
+
"2、MDX-Net-Dereverb模型挺慢的;": "2. El modelo MDX-Net-Dereverb es bastante lento",
|
15 |
+
"2、仅保留主人声:带和声的音频选这个,对主人声可能有削弱。内置HP5一个模型;": "2. Solo retener voz principal: seleccione este para audio con coros, puede debilitar la voz principal. Incluye un modelo HP5",
|
16 |
+
"3、个人推荐的最干净的配置是先MDX-Net再DeEcho-Aggressive。": "3. La configuración más limpia recomendada es primero MDX-Net, luego DeEcho-Aggressive",
|
17 |
+
"3、去混响、去延迟模型(by FoxJoy):": "3. Modelos de eliminación de reverberación y retardo (por FoxJoy)",
|
18 |
+
"ASR 模型": "Modelo ASR",
|
19 |
+
"ASR 模型尺寸": "Tamaño del modelo ASR",
|
20 |
+
"ASR 语言设置": "Configuración del idioma ASR",
|
21 |
+
"GPT 训练: 模型权重文件在 GPT_weights/": "Entrenamiento de GPT: los archivos de pesos del modelo están en GPT_weights/",
|
22 |
+
"GPT模型列表": "Lista de modelos GPT",
|
23 |
+
"GPT训练": "Entrenamiento de GPT",
|
24 |
+
"GPT采样参数(无参考文本时不要太低。不懂就用默认):": "Parámetros de muestreo de GPT (no demasiado bajos cuando no hay texto de referencia. Use los valores por defecto si no está seguro):",
|
25 |
+
"GPU卡号,只能填1个整数": "Número de tarjeta GPU, solo se puede ingresar un número entero",
|
26 |
+
"GPU卡号以-分割,每个卡号一个进程": "Número de tarjeta GPU separado por '-', cada número de tarjeta es un proceso",
|
27 |
+
"LoRA秩": "Rango de LoRA",
|
28 |
+
"SoVITS V3 底模缺失,无法加载相应 LoRA 权重": "Falta el modelo base de SoVITS V3, no se pueden cargar los pesos de LoRA correspondientes",
|
29 |
+
"SoVITS 训练: 模型权重文件在 SoVITS_weights/": "Entrenamiento de SoVITS: los archivos de pesos del modelo están en SoVITS_weights/",
|
30 |
+
"SoVITS模型列表": "Lista de modelos SoVITS",
|
31 |
+
"SoVITS训练": "Entrenamiento de SoVITS",
|
32 |
+
"TTS推理WebUI": "WebUI de inferencia TTS",
|
33 |
+
"UVR5人声伴奏分离&去混响去延迟工具": "Herramienta de separación de voz y acompañamiento UVR5 y eliminación de reverberación y retardo",
|
34 |
+
"alpha_mix:混多少比例归一化后音频进来": "alpha_mix: proporción de mezcla de audio normalizado que entra",
|
35 |
+
"batch_size": "Tamaño de lote",
|
36 |
+
"hop_size:怎么算音量曲线,越小精度越大计算量越高(不是精度越大效果越好)": "hop_size: cómo calcular la curva de volumen, cuanto más pequeño, mayor precisión pero mayor carga computacional (mayor precisión no significa mejor rendimiento)",
|
37 |
+
"max:归一化后最大值多少": "max: valor máximo después de la normalización",
|
38 |
+
"max_sil_kept:切完后静音最多留多长": "max_sil_kept: duración máxima del silencio después del corte",
|
39 |
+
"min_interval:最短切割间隔": "min_interval: intervalo mínimo de corte",
|
40 |
+
"min_length:每段最小多长,如果第一段太短一直和后面段连起来直到超过这个值": "min_length: longitud mínima de cada segmento; si el primer segmento es demasiado corto, se une al siguiente hasta superar este valor",
|
41 |
+
"temperature": "temperatura",
|
42 |
+
"threshold:音量小于这个值视���静音的备选切割点": "umbral: puntos de corte alternativos considerados como silencio si el volumen es menor que este valor",
|
43 |
+
"top_k": "top_k",
|
44 |
+
"top_p": "top_p",
|
45 |
+
"v3暂不支持该模式,使用了会报错。": "v3 no es compatible con este modo actualmente y su uso generará un error.",
|
46 |
+
"v3输出如果觉得闷可以试试开超分": "Si la salida de V3 parece aburrida, puedes intentar activar la superresolución",
|
47 |
+
"不切": "No cortar",
|
48 |
+
"中文": "Chino",
|
49 |
+
"中文教程文档": "Documentación del tutorial en chino",
|
50 |
+
"中英混合": "Chino e inglés mezclados",
|
51 |
+
"主参考音频(请上传3~10秒内参考音频,超过会报错!)": "Audio de referencia principal (Por favor, suba un audio de referencia de entre 3 y 10 segundos, si supera este límite se producirá un error)",
|
52 |
+
"主参考音频的文本": "Texto del audio de referencia principal",
|
53 |
+
"主参考音频的语种": "Idioma del audio de referencia principal",
|
54 |
+
"也可批量输入音频文件, 二选一, 优先读文件夹": "También se pueden ingresar archivos de audio por lotes, seleccionar uno, prioridad para leer carpetas",
|
55 |
+
"人声伴奏分离批量处理, 使用UVR5模型。": "Procesamiento por lotes de separación de voz y acompañamiento utilizando el modelo UVR5",
|
56 |
+
"人声分离WebUI": "WebUI de separación de voces",
|
57 |
+
"人声提取激进程度": "Nivel de agresividad en la extracción de voz",
|
58 |
+
"以下文件或文件夹不存在": "No Existe Tal Archivo o Carpeta",
|
59 |
+
"以下模型不存在:": "No Existe tal Modelo:",
|
60 |
+
"伴奏人声分离&去混响&去回声": "Separación de acompañamiento y voz principal y eliminación de reverberación y eco",
|
61 |
+
"你没有下载超分模型的参数,因此不进行超分。如想超分请先参照教程把文件下载好": "No has descargado los parámetros del modelo de superresolución, por lo que no se realizará la superresolución. Si deseas habilitarla, sigue el tutorial y descarga los archivos necesarios",
|
62 |
+
"使用无参考文本模式时建议使用微调的GPT": "Se recomienda usar un GPT ajustado cuando se use el modo sin texto de referencia.",
|
63 |
+
"保存频率save_every_epoch": "Frecuencia de guardado (cada epoch)",
|
64 |
+
"保持随机": "Mantener aleatorio",
|
65 |
+
"关闭": "Cerrar ",
|
66 |
+
"凑50字一切": "Todo para alcanzar las 50 palabras",
|
67 |
+
"凑四句一切": "Completa cuatro oraciones para rellenar todo",
|
68 |
+
"分桶处理模式已关闭": "Modo de procesamiento por lotes deshabilitado",
|
69 |
+
"分桶处理模式已开启": "Modo de procesamiento por lotes habilitado",
|
70 |
+
"分段返回模式不支持分桶处理,已自动关闭分桶处理": "El modo de retorno segmentado no es compatible con el procesamiento por lotes, se ha deshabilitado automáticamente",
|
71 |
+
"分段返回模式已开启": "Modo de retorno segmentado habilitado",
|
72 |
+
"分段间隔(秒)": "Intervalo de segmentación (segundos)",
|
73 |
+
"分段间隔过小,已自动设置为0.01": "El intervalo de segmentación es demasiado pequeño, se ha ajustado automáticamente a 0.01",
|
74 |
+
"切分": "Segmentación",
|
75 |
+
"切分后文本": "Texto después de la segmentación",
|
76 |
+
"切分后的子音频的输出根目录": "Directorio raíz de salida de los sub-audios después de la división",
|
77 |
+
"切分文本": "Segmentar texto",
|
78 |
+
"切割使用的进程数": "Número de procesos utilizados para la división",
|
79 |
+
"刷新模型路径": "Actualizar la ruta del modelo",
|
80 |
+
"前端处理后的文本(每句):": "Texto después del procesamiento previo (por frase):",
|
81 |
+
"前置数据集获取工具": "Herramienta de adquisición de conjunto de datos previo",
|
82 |
+
"占用中": " En uso",
|
83 |
+
"去混响/去延迟,附:": "Eliminación de reverberación/retardo, incluye:",
|
84 |
+
"参考音频在3~10秒范围外,请更换!": "El audio de referencia está fuera del rango de 3 a 10 segundos, ¡por favor cámbielo!",
|
85 |
+
"参考音频的文本": "Texto de referencia del audio",
|
86 |
+
"参考音频的语种": "Idioma del audio de referencia",
|
87 |
+
"句间停顿秒数": "Segundos de pausa entre frases",
|
88 |
+
"可选项:通过拖拽多个文件上传多个参考音频(建议同性),平均融合他们的音色。如不填写此项,音色由左侧单个参考音频控制。如是微调模型,建议参考音频全部在微调训练集音色内,底模不用管。": "Opcional: Sube varios archivos de audio de referencia arrastrándolos y soltándolos (se recomienda que sean del mismo género) y promedia sus tonos. Si esta opción se deja en blanco, el tono será controlado por el único audio de referencia a la izquierda. Si se está afinando el modelo, se recomienda que todos los archivos de audio de referencia tengan tonos dentro del conjunto de entrenamiento de ajuste fino; se puede ignorar el modelo preentrenado.",
|
89 |
+
"合成语音": "Síntesis de voz",
|
90 |
+
"合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。": "Ejemplo de formato de ruta de carpeta válida: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例 (simplemente copie desde la barra de direcciones del administrador de archivos).",
|
91 |
+
"后续将支持转音素、手工修改音素、语音合成分步执行。": "Se añadirá soporte para conversión de fonemas, edición manual de fonemas y síntesis de voz por pasos en el futuro.",
|
92 |
+
"听不清参考音频说的啥(不晓得写啥)可以开。开启后无视填写的参考文本。": "Si el audio de referencia no es claro o no sabe qué escribir, habilite esta opción para ignorar el texto de referencia.",
|
93 |
+
"启用并行推理版本": "Habilitar versión de inferencia paralela",
|
94 |
+
"填切割后音频所在目录!读取的音频文件完整路径=该目录-拼接-list文件里波形对应的文件名(不是全路径)。如果留空则使用.list文件里的绝对全路径。": "Ingrese el directorio donde se encuentran los audios después de la división. La ruta completa de los archivos de audio leídos = este directorio + nombre de archivo correspondiente en el archivo .list (no la ruta completa). Si se deja en blanco, se utilizará la ruta completa del archivo .list.",
|
95 |
+
"多语种混合": "Mezcla de varios idiomas",
|
96 |
+
"多语种混合(粤语)": "Mezcla Multilingüe (Cantonés)",
|
97 |
+
"失败": " Fallido",
|
98 |
+
"如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE.": "Si no acepta estos términos, no puede utilizar ni hacer referencia a ningún código o archivo dentro del paquete de software. Consulte el archivo LICENSE en el directorio raíz para obtener más detalles.",
|
99 |
+
"实际输入的参考文本:": "Texto de referencia realmente ingresado:",
|
100 |
+
"实际输入的目标文本(切句后):": "Texto objetivo realmente ingresado (después de dividir en frases):",
|
101 |
+
"实际输入的目标文本(每句):": "Texto objetivo realmente ingresado (por frase):",
|
102 |
+
"实际输入的目标文本:": "Texto objetivo realmente ingresado:",
|
103 |
+
"导出文件格式": "Formato de archivo de exportación",
|
104 |
+
"已关闭": " Desactivado",
|
105 |
+
"已完成": " Completado",
|
106 |
+
"已开启": " Activado",
|
107 |
+
"并行推理": "Inferencia paralela",
|
108 |
+
"并行推理模式已关闭": "Modo de inferencia paralela deshabilitado",
|
109 |
+
"并行推理模式已开启": "Modo de inferencia paralela habilitado",
|
110 |
+
"开启": "Activar ",
|
111 |
+
"开启无参考文本模式。不填参考文本亦相当于开启。": "Habilitar el modo sin texto de referencia. No llenar el texto de referencia también lo habilita.",
|
112 |
+
"微调训练": "Entrenamiento de ajuste fino",
|
113 |
+
"怎么切": "Cómo cortar",
|
114 |
+
"总训练轮数total_epoch": "Número total de épocas de entrenamiento",
|
115 |
+
"总训练轮数total_epoch,不建议太高": "Número total de épocas de entrenamiento, no se recomienda demasiado alto",
|
116 |
+
"指定输出主人声文件夹": "Especificar carpeta de salida de voz principal",
|
117 |
+
"指定输出非主人声文件夹": "Especificar carpeta de salida de no voz principal",
|
118 |
+
"按中文句号。切": "Cortar según puntos en chino",
|
119 |
+
"按标点符号切": "Cortar según los signos de puntuación",
|
120 |
+
"按英文句号.切": "Cortar por puntos en inglés.",
|
121 |
+
"推理": "Inferencia",
|
122 |
+
"推理设置": "Configuración de inferencia",
|
123 |
+
"提取文本Bert特征": "Extraer características de texto con BERT",
|
124 |
+
"数据分桶(并行推理时会降低一点计算量)": "Agrupación de datos (Reduce el costo computacional en inferencia paralela)",
|
125 |
+
"数据类型精度": "precisión del tipo de datos",
|
126 |
+
"文本分词与特征提取": "Segmentación de texto y extracción de características",
|
127 |
+
"文本切分工具。太长的文本合成出来效果不一定好,所以太长建议先切。合成会根据文本的换行分开合成再拼起来。": "Herramienta de segmentación de texto. Un texto demasiado largo puede no producir buenos resultados, por lo que se recomienda segmentarlo. La síntesis se realizará por separado según los saltos de línea y luego se unirá.",
|
128 |
+
"文本模块学习率权重": "Peso de la tasa de aprendizaje del módulo de texto",
|
129 |
+
"施工中,请静候佳音": "En construcción, por favor espere pacientemente",
|
130 |
+
"日文": "Japonés",
|
131 |
+
"日英混合": "Mezcla de japonés e inglés",
|
132 |
+
"是否仅保存最新的权重文件以节省硬盘空间": "¿Guardar solo el último archivo de pesos más reciente para ahorrar espacio en disco?",
|
133 |
+
"是否在每次保存时间点将最终小模型保存至weights文件夹": "¿Guardar el modelo final pequeño en la carpeta de pesos en cada punto de guardado?",
|
134 |
+
"是否开启DPO训练选项(实验性)": "¿Habilitar la opción de entrenamiento dpo (experimental)?",
|
135 |
+
"是否直接对上次合成结果调整语速和音色。防止随机性。": "¿Ajustar directamente la velocidad del habla y el tono del último resultado de síntesis? Para prevenir la aleatoriedad.",
|
136 |
+
"显卡信息": "Información de la tarjeta gráfica",
|
137 |
+
"未下载模型": "Modelo no descargado",
|
138 |
+
"本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.": "Este software es de código abierto bajo la licencia MIT. El autor no tiene control sobre el software. El usuario que lo utilice o distribuya, y el que genere sonidos a partir del software, asume toda la responsabilidad.",
|
139 |
+
"标注文件路径 (含文件后缀 *.list)": "Ruta del archivo de anotaciones (con extensión *.list)",
|
140 |
+
"模型": "Modelo",
|
141 |
+
"模型分为三类:": "Los modelos se dividen en tres categorías:",
|
142 |
+
"模型切换": "Cambio de modelo",
|
143 |
+
"每张显卡的batch_size": "Tamaño de lote por tarjeta gráfica",
|
144 |
+
"版本": "Versión",
|
145 |
+
"粤英混合": "Mezcla Cantonés-Inglés",
|
146 |
+
"粤语": "Cantonés",
|
147 |
+
"终止合成": "Terminar síntesis",
|
148 |
+
"缺少Hubert数据集": "Falta el Conjunto de Datos de Hubert",
|
149 |
+
"缺少语义数据集": "Falta el Conjunto de Datos Semánticos",
|
150 |
+
"缺少音素数据集": "Falta el Conjunto de Datos de Fonemas",
|
151 |
+
"缺少音频数据集": "Falta el Conjunto de Datos de Audio",
|
152 |
+
"英文": "Inglés",
|
153 |
+
"训练集格式化一键三连": "Formato del conjunto de entrenamiento en un solo paso",
|
154 |
+
"训练集格式化工具": "Herramienta de formateo del conjunto de datos de entrenamiento",
|
155 |
+
"语义Token提取": "Extracción de tokens semánticos",
|
156 |
+
"语速": "Velocidad de habla",
|
157 |
+
"语速调整,高为更快": "Ajustar la velocidad de habla, más alta para más rápido",
|
158 |
+
"语速调节不支持分桶处理,已自动关闭分桶处理": "El ajuste de velocidad de voz no es compatible con el procesamiento por lotes, se ha deshabilitado automáticamente",
|
159 |
+
"语音切分": "Segmentación de voz",
|
160 |
+
"语音切分工具": "Herramienta de división de voz",
|
161 |
+
"语音文本校对标注工具": "Herramienta de corrección y anotación de texto de voz",
|
162 |
+
"语音自监督特征提取": "Extracción de características de voz con auto-supervisión",
|
163 |
+
"语音识别": "Reconocimiento de voz",
|
164 |
+
"语音识别工具": "Herramienta de reconocimiento de voz",
|
165 |
+
"语音降噪": "Reducción de ruido en la voz",
|
166 |
+
"语音降噪工具": "Herramienta de reducción de ruido de voz",
|
167 |
+
"请上传3~10秒内参考音频,超过会报错!": "Por favor, suba un audio de referencia de entre 3 y 10 segundos, ¡más de eso causará un error!",
|
168 |
+
"请上传参考音频": "Por Favor, Suba el Audio de Referencia",
|
169 |
+
"请填入推理文本": "Por Favor, Ingrese el Texto Objetivo",
|
170 |
+
"请填入正确的List路径": "Por Favor, Introduzca la Ruta Correcta de la Lista",
|
171 |
+
"请填入正确的音频文件夹路径": "Por Favor, Introduzca la Ruta Correcta de la Carpeta de Audio",
|
172 |
+
"请输入有效文本": "Por favor, introduzca un texto válido",
|
173 |
+
"路径不存在,使用默认配置": "Ruta no encontrada, usando configuración predeterminada",
|
174 |
+
"路径不能为空": "Se Espera que la Ruta No Esté Vacía",
|
175 |
+
"路径错误": "Error de Ruta",
|
176 |
+
"转换": "Convertir",
|
177 |
+
"辅参考音频(可选多个,或不选)": "Audio de referencia secundario (Opcional, se pueden seleccionar varios o ninguno)",
|
178 |
+
"输入待处理音频文件夹路径": "Ingrese la ruta de la carpeta de audio a procesar",
|
179 |
+
"输入文件夹路径": "Ingrese la ruta de la carpeta",
|
180 |
+
"输入路径不存在": "La ruta de entrada no existe",
|
181 |
+
"输入路径存在但不可用": "La ruta de entrada existe pero no es accesible",
|
182 |
+
"输出logs/实验名目录下应有23456开头的文件和文件夹": "Debe haber archivos y carpetas que comiencen con 23456 en el directorio logs/nombre del experimento",
|
183 |
+
"输出信息": "Información de salida",
|
184 |
+
"输出文件夹路径": "Ruta de la carpeta de salida",
|
185 |
+
"输出的语音": "Audio de salida",
|
186 |
+
"运行中": " En ejecución",
|
187 |
+
"进度": "Progreso",
|
188 |
+
"进程已终止": " Proceso terminado",
|
189 |
+
"进程输出信息": " Información de salida del proceso",
|
190 |
+
"选择训练完存放在SoVITS_weights和GPT_weights下的模型。默认的一个是底模,体验5秒Zero Shot TTS用。": "Seleccione el modelo almacenado en SoVITS_weights y GPT_weights después del entrenamiento. Uno de ellos es el modelo base, útil para experimentar con TTS de 5 segundos sin entrenamiento.",
|
191 |
+
"采样步数,如果觉得电,提高试试,如果觉得慢,降低试试": "Pasos de muestreo: si se siente ruidoso, intente aumentarlo; si es lento, intente reducirlo",
|
192 |
+
"重复惩罚": "Penalización por repetición",
|
193 |
+
"随机种子": "Semilla aleatoria",
|
194 |
+
"需先终止才能开启下一次任务": " Debe terminarse antes de iniciar la siguiente tarea",
|
195 |
+
"需要合成的切分前文本": "Texto antes de la segmentación para la síntesis",
|
196 |
+
"需要合成的文本": "Texto a sintetizar",
|
197 |
+
"需要合成的文本的语种": "Idioma del texto a sintetizar",
|
198 |
+
"需要合成的语种": "Idioma para la síntesis",
|
199 |
+
"韩文": "Coreano",
|
200 |
+
"韩英混合": "Mezcla Coreano-Inglés",
|
201 |
+
"音频加载失败": "Error al Cargar el Audio",
|
202 |
+
"音频文件不存在,跳过:": "Archivo de audio no encontrado, omitiendo: ",
|
203 |
+
"音频标注WebUI": "WebUI de etiquetado de audio",
|
204 |
+
"音频自动切分输入路径,可文件可文件夹": "Ruta de entrada para la división automática de audio, puede ser un archivo o una carpeta",
|
205 |
+
"音频超分中": "Superresolución de audio en proceso",
|
206 |
+
"预训练GPT模型路径": "Ruta del modelo GPT preentrenado",
|
207 |
+
"预训练SSL模型路径": "Ruta del modelo SSL preentrenado",
|
208 |
+
"预训练SoVITS-D模型路径": "Ruta del modelo SoVITS-D preentrenado",
|
209 |
+
"预训练SoVITS-G模型路径": "Ruta del modelo SoVITS-G preentrenado",
|
210 |
+
"预训练中文BERT模型路径": "Ruta del modelo BERT en Chino preentrenado"
|
211 |
+
}
|
tools/i18n/locale/fr_FR.json
ADDED
@@ -0,0 +1,211 @@
1 |
+
{
|
2 |
+
"(1)MDX-Net(onnx_dereverb):对于双通道混响是最好的选择,不能去除单通道混响;": "(1) MDX-Net (onnx_dereverb) : C'est le meilleur choix pour la réverbération à deux canaux, mais il ne peut pas éliminer la réverbération à un seul canal;",
|
3 |
+
"(234)DeEcho:去除延迟效果。Aggressive比Normal去除得更彻底,DeReverb额外去除混响,可去除单声道混响,但是对高频重的板式混响去不干净。": "(234)DeEcho : Supprime les effets de délai. Aggressive est plus exhaustif que Normal dans la suppression, DeReverb élimine également la réverbération, peut supprimer la réverbération monocanal, mais n'élimine pas complètement la réverbération de plaque à haute fréquence.",
|
4 |
+
"*实验/模型名": "*Nom de l'expérience/modèle",
|
5 |
+
"*文本标注文件": "*Fichier d'annotation de texte",
|
6 |
+
"*训练集音频文件目录": "*Répertoire des fichiers audio d'entraînement",
|
7 |
+
"*请上传并填写参考信息": "*Veuillez télécharger et remplir les informations de référence",
|
8 |
+
"*请填写需要合成的目标文本和语种模式": "*Veuillez saisir le texte cible à synthétiser et le mode de langue.",
|
9 |
+
".限制范围越小判别效果越好。": "Moins il y a de langues, mieux c'est",
|
10 |
+
"1-GPT-SoVITS-TTS": "1-GPT-SoVITS-TTS",
|
11 |
+
"1、DeEcho-DeReverb模型的耗时是另外2个DeEcho模型的接近2倍;": "1. Le temps de traitement du modèle DeEcho-DeReverb est presque le double de celui des deux autres modèles DeEcho;",
|
12 |
+
"1、保留人声:不带和声的音频选这个,对主人声保留比HP5更好。内置HP2和HP3两个模型,HP3可能轻微漏伴奏但对主人声保留比HP2稍微好一丁点;": "1. Préserver les voix : Choisissez cette option pour les audio sans harmonie, car elle conserve mieux la voix principale par rapport au modèle HP5. Deux modèles intégrés, HP2 et HP3, sont disponibles. HP3 peut légèrement laisser passer l'accompagnement mais conserve la voix principale un peu mieux que HP2;",
|
13 |
+
"2-GPT-SoVITS-变声": "2-GPT-SoVITS-Modification de la voix",
|
14 |
+
"2、MDX-Net-Dereverb模型挺慢的;": "2. Le modèle MDX-Net-Dereverb est assez lent;",
|
15 |
+
"2、仅保留主人声:带和声的音频选这个,对主人声可能有削弱。内置HP5一个模型;": "2. Conserver uniquement la voix principale : Choisissez cette option pour les audio avec harmonie, car elle peut affaiblir la voix principale. Un modèle HP5 intégré est disponible;",
|
16 |
+
"3、个人推荐的最干净的配置是先MDX-Net再DeEcho-Aggressive。": "3. La configuration la plus propre que je recommande est d'utiliser d'abord MDX-Net, puis DeEcho-Aggressive.",
|
17 |
+
"3、去混响、去延迟模型(by FoxJoy):": "3. Modèle de suppression de réverbération et de retard (par FoxJoy) :",
|
18 |
+
"ASR 模型": "Modèle ASR",
|
19 |
+
"ASR 模型尺寸": "Taille du modèle ASR",
|
20 |
+
"ASR 语言设置": "Paramètres de langue ASR",
|
21 |
+
"GPT 训练: 模型权重文件在 GPT_weights/": "Entraînement GPT : les poids du modèle sont dans GPT_weights/",
|
22 |
+
"GPT模型列表": "Liste des modèles GPT",
|
23 |
+
"GPT训练": "Entraînement GPT",
|
24 |
+
"GPT采样参数(无参考文本时不要太低。不懂就用默认):": "Paramètres d'échantillonnage de GPT (ne pas mettre trop bas lorsqu'il n'y a pas de texte de référence. Utilisez les valeurs par défaut si vous n'êtes pas sûr):",
|
25 |
+
"GPU卡号,只能填1个整数": "Numéro de carte GPU, ne peut contenir qu'un seul entier",
|
26 |
+
"GPU卡号以-分割,每个卡号一个进程": "Numéro de carte GPU séparé par des tirets, un processus par numéro de carte",
|
27 |
+
"LoRA秩": "Rang LoRA",
|
28 |
+
"SoVITS V3 底模缺失,无法加载相应 LoRA 权重": "Modèle de base SoVITS V3 manquant, impossible de charger les poids LoRA correspondants",
|
29 |
+
"SoVITS 训练: 模型权重文件在 SoVITS_weights/": "Entraînement SoVITS : les poids du modèle sont dans SoVITS_weights/",
|
30 |
+
"SoVITS模型列表": "Liste des modèles SoVITS",
|
31 |
+
"SoVITS训练": "Entraînement SoVITS",
|
32 |
+
"TTS推理WebUI": "Interface Web d'inférence TTS",
|
33 |
+
"UVR5人声伴奏分离&去混响去延迟工具": "Outil UVR5 de séparation voix/accompagnement & suppression de réverbération et de latence",
|
34 |
+
"alpha_mix:混多少比例归一化后音频进来": "alpha_mix: proportion d'audio normalisé mélangé",
|
35 |
+
"batch_size": "Taille de lot",
|
36 |
+
"hop_size:怎么算音量曲线,越小精度越大计算量越高(不是精度越大效果越好)": "hop_size: comment calculer la courbe de volume, plus petit pour une précision plus élevée mais une charge de calcul plus élevée (ce n'est pas une meilleure précision)",
|
37 |
+
"max:归一化后最大值多少": "max: valeur maximale après normalisation",
|
38 |
+
"max_sil_kept:切完后静音最多留多长": "max_sil_kept: durée maximale de silence après la coupe",
|
39 |
+
"min_interval:最短切割间隔": "min_interval: intervalle de coupe minimum",
|
40 |
+
"min_length:每段最小多长,如果第一段太短一直和后面段连起来直到超过这个值": "min_length:longueur minimale de chaque segment ; si le premier segment est trop court, il est concaténé avec les segments suivants jusqu'à ce que la longueur dépasse cette valeur",
|
41 |
+
"temperature": "température",
|
42 |
+
"threshold:音量小于这个值视作静音的备选切割点": "seuil: le volume inférieur à cette valeur est considéré comme un point de coupe silencieux alternatif",
|
43 |
+
"top_k": "top_k",
|
44 |
+
"top_p": "top_p",
|
45 |
+
"v3暂不支持该模式,使用了会报错。": "Le mode n'est pas encore supporté par la v3. Une erreur se produira en cas d'utilisation.",
|
46 |
+
"v3输出如果觉得闷可以试试开超分": "Si la sortie v3 semble étouffée, essayez l'upscaling",
|
47 |
+
"不切": "Pas de découpe",
|
48 |
+
"中文": "Chinois",
|
49 |
+
"中文教程文档": "Documentation du tutoriel en chinois",
|
50 |
+
"中英混合": "Mélange de chinois et d'anglais",
|
51 |
+
"主参考音频(请上传3~10秒内参考音频,超过会报错!)": "Audio de référence principal (Veuillez télécharger un audio de référence entre 3 et 10 secondes, dépasser cette limite entraînera une erreur !)",
|
52 |
+
"主参考音频的文本": "Texte de l’audio de référence principal",
|
53 |
+
"主参考音频的语种": "Langue de l’audio de référence principal",
|
54 |
+
"也可批量输入音频文件, 二选一, 优先读文件夹": "Également possible d'entrer en lot des fichiers audio, au choix, privilégiez la lecture du dossier",
|
55 |
+
"人声伴奏分离批量处理, 使用UVR5模型。": "Traitement par lot de séparation voix-accompagnement en utilisant le modèle UVR5.",
|
56 |
+
"人声分离WebUI": "Interface Web de séparation des voix",
|
57 |
+
"人声提取激进程度": "Degré d'extraction des voix",
|
58 |
+
"以下文件或文件夹不存在": "Aucun Fichier ou Dossier de ce Type",
|
59 |
+
"以下模型不存在:": "Aucun Modèle de ce Type:",
|
60 |
+
"伴奏人声分离&去混响&去回声": "Séparation de la voix et de l'accompagnement, suppression de la réverbération et de l'écho",
|
61 |
+
"你没有下载超分模型的参数,因此不进行超分。如想超分请先参照教程把文件下载好": "Vous n'avez pas téléchargé les paramètres du modèle d'upscaling, donc l'upscaling ne sera pas effectué. Veuillez suivre le tutoriel pour télécharger les fichiers nécessaires",
|
62 |
+
"使用无参考文本模式时建议使用微调的GPT": "Il est recommandé d'utiliser un GPT finement ajusté lors de l'utilisation du mode sans texte de référence.",
|
63 |
+
"保存频率save_every_epoch": "Fréquence de sauvegarde (sauvegarder à chaque époque)",
|
64 |
+
"保持随机": "Garder aléatoire",
|
65 |
+
"关闭": "Fermer ",
|
66 |
+
"凑50字一切": "Assembler 50 mots tout",
|
67 |
+
"凑四句一切": "Composez quatre phrases pour tout remplir",
|
68 |
+
"分桶处理模式已关闭": "Mode de traitement par regroupement désactivé",
|
69 |
+
"分桶处理模式已开启": "Mode de traitement par regroupement activé",
|
70 |
+
"分段返回模式不支持分桶处理,已自动关闭分桶处理": "Le mode de retour segmenté ne prend pas en charge le regroupement des données, désactivation automatique",
|
71 |
+
"分段返回模式已开启": "Mode de retour segmenté activé",
|
72 |
+
"分段间隔(秒)": "Intervalle de segmentation (secondes)",
|
73 |
+
"分段间隔过小,已自动设置为0.01": "L’intervalle de segmentation est trop court, réglé automatiquement à 0.01",
|
74 |
+
"切分": "Segmentation",
|
75 |
+
"切分后文本": "Texte après segmentation",
|
76 |
+
"切分后的子音频的输出根目录": "Répertoire racine de sortie des sous-audios après découpage",
|
77 |
+
"切分文本": "Segmenter le texte",
|
78 |
+
"切割使用的进程数": "Nombre de processus utilisés pour le découpage",
|
79 |
+
"刷新模型路径": "Actualiser le chemin du modèle",
|
80 |
+
"前端处理后的文本(每句):": "Texte après traitement frontal (par phrase):",
|
81 |
+
"前置数据集获取工具": "Outil de récupération des ensembles de données",
|
82 |
+
"占用中": " Occupé",
|
83 |
+
"去混响/去延迟,附:": "Suppression de la réverbération / suppression du retard, ci-joint:",
|
84 |
+
"参考音频在3~10秒范围外,请更换!": "Veuillez remplacer l'audio de référence si sa durée est en dehors de la plage de 3 à 10 secondes!",
|
85 |
+
"参考音频的文本": "Texte de l'audio de référence",
|
86 |
+
"参考音频的语种": "Langue de l'audio de référence",
|
87 |
+
"句间停顿秒数": "Temps de pause entre les phrases (secondes)",
|
88 |
+
"可选项:通过拖拽多个文件上传多个参考音频(建议同性),平均融合他们的音色。如不填写此项,音色由左侧单个参考音频控制。如是微调模型,建议参考音频全部在微调训练集音色内,底模不用管。": "Optionnel : Téléchargez plusieurs fichiers audio de référence en les faisant glisser (recommandé d'être du même genre) et fusionnez leur tonalité. Si cette option est laissée vide, la tonalité sera contrôlée par l'unique fichier audio de référence à gauche. Si vous ajustez le modèle, il est recommandé que tous les fichiers audio de référence aient des tonalités dans l'ensemble d'entraînement d'ajustement ; le modèle pré-entrainé peut être ignoré.",
|
89 |
+
"合成语音": "Synthèse vocale",
|
90 |
+
"合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。": "Exemple de format de chemin de dossier valide : E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例 (copiez-le depuis la barre d'adresse de l'explorateur de fichiers).",
|
91 |
+
"后续将支持转音素、手工修改音素、语音合成分步执行。": "Le support pour la conversion phonémique, l’édition manuelle des phonèmes et la synthèse vocale par étapes sera ajouté ultérieurement.",
|
92 |
+
"听不清参考音频说的啥(不晓得写啥)可以开。开启后无视填写的参考文本。": "Si vous ne comprenez pas bien l'audio de référence (vous ne savez pas quoi écrire), vous pouvez activer cette option. Une fois activée, le texte de référence sera ignoré.",
|
93 |
+
"启用并行推理版本": "Activer la version d’inférence parallèle",
|
94 |
+
"填切割后音频所在目录!读取的音频文件完整路径=该目录-拼接-list文件里波形对应的文件名(不是全路径)。如果留空则使用.list文件里的绝对全路径。": "Veuillez indiquer le répertoire contenant les audio découpés ! Le chemin complet du fichier audio à lire = ce répertoire - nom du fichier correspondant à l'onde dans le fichier .list (pas le chemin complet). Si laissé vide, le chemin absolu dans le fichier .list sera utilisé.",
|
95 |
+
"多语种混合": "Mélange multilingue",
|
96 |
+
"多语种混合(粤语)": "Mélange Multilingue (Cantonais)",
|
97 |
+
"失败": " Échec",
|
98 |
+
"如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE.": "Si vous n'acceptez pas ces conditions, vous ne pouvez ni utiliser ni citer aucun code ou fichier du package logiciel. Voir LICENSE à la racine.",
|
99 |
+
"实际输入的参考文本:": "Texte de référence réellement saisi:",
|
100 |
+
"实际输入的目标文本(切句后):": "Texte cible réellement saisi (après découpage):",
|
101 |
+
"实际输入的目标文本(每句):": "Texte cible réellement saisi (par phrase):",
|
102 |
+
"实际输入的目标文本:": "Texte cible réellement saisi:",
|
103 |
+
"导出文件格式": "Format d'exportation du fichier",
|
104 |
+
"已关闭": " Fermé",
|
105 |
+
"已完成": " Terminé",
|
106 |
+
"已开启": " Activé",
|
107 |
+
"并行推理": "Inférence parallèle",
|
108 |
+
"并行推理模式已关闭": "Mode d’inférence parallèle désactivé",
|
109 |
+
"并行推理模式已开启": "Mode d’inférence parallèle activé",
|
110 |
+
"开启": "Activer ",
|
111 |
+
"开启无参考文本模式。不填参考文本亦相当于开启。": "Activer le mode sans texte de référence. Laisser le texte de référence vide équivaut également à activer le mode.",
|
112 |
+
"微调训练": "Entraînement de fine-tuning",
|
113 |
+
"怎么切": "Comment découper",
|
114 |
+
"总训练轮数total_epoch": "Nombre total d'époques d'entraînement",
|
115 |
+
"总训练轮数total_epoch,不建议太高": "Nombre total d'époques d'entraînement, pas recommandé d'être trop élevé",
|
116 |
+
"指定输出主人声文件夹": "Spécifier le dossier de sortie pour la voix principale",
|
117 |
+
"指定输出非主人声文件夹": "Spécifier le dossier de sortie pour la non-voix principale",
|
118 |
+
"按中文句号。切": "Couper selon les points en chinois.",
|
119 |
+
"按标点符号切": "Couper selon les signes de ponctuation",
|
120 |
+
"按英文句号.切": "Découpez par des points en anglais",
|
121 |
+
"推理": "Inférence",
|
122 |
+
"推理设置": "Paramètres d’inférence",
|
123 |
+
"提取文本Bert特征": "Extraire les caractéristiques du texte avec BERT",
|
124 |
+
"数据分桶(并行推理时会降低一点计算量)": "Regroupement des données (Réduit le coût de calcul en inférence parallèle)",
|
125 |
+
"数据类型精度": "précision du type de données",
|
126 |
+
"文本分词与特征提取": "Segmentation et extraction de caractéristiques du texte",
|
127 |
+
"文本切分工具。太长的文本合成出来效果不一定好,所以太长建议先切。合成会根据文本的换行分开合成再拼起来。": "Outil de segmentation de texte. Un texte trop long peut donner un mauvais résultat, il est donc recommandé de le segmenter. La synthèse se fera selon les sauts de ligne puis sera assemblée.",
|
128 |
+
"文本模块学习率权重": "Poids du taux d'apprentissage du module de texte",
|
129 |
+
"施工中,请静候佳音": "En construction, veuillez attendre patiemment",
|
130 |
+
"日文": "Japonais",
|
131 |
+
"日英混合": "Mélange Japonais-Anglais",
|
132 |
+
"是否仅保存最新的权重文件以节省硬盘空间": "Faut-il ne conserver que les derniers fichiers de poids pour économiser de l'espace disque ?",
|
133 |
+
"是否在每次保存时间点将最终小模型保存至weights文件夹": "Sauvegarder le petit modèle final dans le dossier weights à chaque point de sauvegarde",
|
134 |
+
"是否开启DPO训练���项(实验性)": "Activer l'option d'entraînement DPO (expérimental) ?",
|
135 |
+
"是否直接对上次合成结果调整语速和音色。防止随机性。": "Ajuster la vitesse de parole et la tonalité du dernier résultat de synthèse pour prévenir l'aléatoire.",
|
136 |
+
"显卡信息": "Informations sur la carte graphique",
|
137 |
+
"未下载模型": "Modèle non téléchargé",
|
138 |
+
"本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.": "Ce logiciel est open-source sous licence MIT. L'auteur n'exerce aucun contrôle sur le logiciel. L'utilisateur et toute personne diffusant les sorties audio générées sont entièrement responsables.",
|
139 |
+
"标注文件路径 (含文件后缀 *.list)": "Chemin du fichier d'annotation (avec l'extension *.list)",
|
140 |
+
"模型": "Modèle",
|
141 |
+
"模型分为三类:": "Les modèles sont classés en trois catégories:",
|
142 |
+
"模型切换": "Changement de modèle",
|
143 |
+
"每张显卡的batch_size": "Taille de lot par carte graphique",
|
144 |
+
"版本": "Version",
|
145 |
+
"粤英混合": "Mélange Cantonais-Anglais",
|
146 |
+
"粤语": "Cantonais",
|
147 |
+
"终止合成": "Terminer la synthèse",
|
148 |
+
"缺少Hubert数据集": "Jeu de Données Hubert Manquant",
|
149 |
+
"缺少语义数据集": "Jeu de Données Sémantiques Manquant",
|
150 |
+
"缺少音素数据集": "Jeu de Données de Phonèmes Manquant",
|
151 |
+
"缺少音频数据集": "Jeu de Données Audio Manquant",
|
152 |
+
"英文": "Anglais",
|
153 |
+
"训练集格式化一键三连": "Formatage de l'ensemble d'entraînement en un clic",
|
154 |
+
"训练集格式化工具": "Outil de formatage des ensembles d'entraînement",
|
155 |
+
"语义Token提取": "Extraction de tokens sémantiques",
|
156 |
+
"语速": "Débit de parole",
|
157 |
+
"语速调整,高为更快": "Ajuster la vitesse de parole, plus élevée pour plus rapide",
|
158 |
+
"语速调节不支持分桶处理,已自动关闭分桶处理": "Le réglage de la vitesse vocale ne prend pas en charge le regroupement des données, désactivation automatique",
|
159 |
+
"语音切分": "Segmentation vocale",
|
160 |
+
"语音切分工具": "Outil de segmentation vocale",
|
161 |
+
"语音文本校对标注工具": "Outil d'annotation et de correction des transcriptions vocales",
|
162 |
+
"语音自监督特征提取": "Extraction de caractéristiques auto-supervisée pour l'audio",
|
163 |
+
"语音识别": "Reconnaissance vocale",
|
164 |
+
"语音识别工具": "Outil de reconnaissance vocale",
|
165 |
+
"语音降噪": "Réduction du bruit audio",
|
166 |
+
"语音降噪工具": "Outil de réduction du bruit audio",
|
167 |
+
"请上传3~10秒内参考音频,超过会报错!": "Veuillez télécharger une référence audio de 3 à 10 secondes ; les fichiers plus longs généreront une erreur!",
|
168 |
+
"请上传参考音频": "Veuillez télécharger l'audio de référence",
|
169 |
+
"请填入推理文本": "Veuillez remplir le texte cible",
|
170 |
+
"请填入正确的List路径": "Veuillez Remplir le Chemin Correct de la Liste",
|
171 |
+
"请填入正确的音频文件夹路径": "Veuillez Remplir le Chemin Correct du Dossier Audio",
|
172 |
+
"请输入有效文本": "Veuillez entrer un texte valide",
|
173 |
+
"路径不存在,使用默认配置": "Chemin introuvable, utilisation de la configuration par défaut",
|
174 |
+
"路径不能为空": "Chemin Non Vide Attendu",
|
175 |
+
"路径错误": "Erreur de Chemin",
|
176 |
+
"转换": "Conversion",
|
177 |
+
"辅参考音频(可选多个,或不选)": "Audio de référence secondaire (Facultatif, plusieurs possibles ou aucun)",
|
178 |
+
"输入待处理音频文件夹路径": "Entrez le chemin du dossier audio à traiter",
|
179 |
+
"输入文件夹路径": "Chemin du dossier à entrer",
|
180 |
+
"输入路径不存在": "Le chemin d'entrée n'existe pas",
|
181 |
+
"输入路径存在但不可用": "Le chemin d'entrée existe mais est inutilisable",
|
182 |
+
"输出logs/实验名目录下应有23456开头的文件和文件夹": "Les fichiers et dossiers commençant par 23456 devraient être présents dans le répertoire logs/nom de l'expérience",
|
183 |
+
"输出信息": "Sortie d'information",
|
184 |
+
"输出文件夹路径": "Chemin du dossier de sortie",
|
185 |
+
"输出的语音": "Audio de sortie",
|
186 |
+
"运行中": " en cours d'exécution",
|
187 |
+
"进度": "Progression",
|
188 |
+
"进程已终止": " Processus terminé",
|
189 |
+
"进程输出信息": " Sortie du processus",
|
190 |
+
"选择训练完存放在SoVITS_weights和GPT_weights下的模型。默认的一个是底模,体验5秒Zero Shot TTS用。": "Choisissez le modèle entraîné stocké sous SoVITS_weights et GPT_weights. Par défaut, l'un d'eux est un modèle de base pour l'expérience de TTS Zero Shot de 5 secondes.",
|
191 |
+
"采样步数,如果觉得电,提高试试,如果觉得慢,降低试试": "Nombre d’étapes d’échantillonnage : si le son est bruité, essayez d’augmenter, si c’est lent, essayez de réduire",
|
192 |
+
"重复惩罚": "Pénalité de répétition",
|
193 |
+
"随机种子": "Graine aléatoire",
|
194 |
+
"需先终止才能开启下一次任务": "Il faut d'abord arrêter le processus avant de lancer une nouvelle tâche",
|
195 |
+
"需要合成的切分前文本": "Texte avant segmentation pour la synthèse",
|
196 |
+
"需要合成的文本": "Texte à synthétiser",
|
197 |
+
"需要合成的文本的语种": "Langue du texte à synthétiser",
|
198 |
+
"需要合成的语种": "Langue de synthèse requise",
|
199 |
+
"韩文": "Coreano",
|
200 |
+
"韩英混合": "Mezcla Coreano-Inglés",
|
201 |
+
"音频加载失败": "Échec du Chargement de l'Audio",
|
202 |
+
"音频文件不存在,跳过:": "Fichier audio introuvable, passage : ",
|
203 |
+
"音频标注WebUI": "Interface Web d'annotation audio",
|
204 |
+
"音频自动切分输入路径,可文件可文件夹": "Chemin d'entrée automatique de découpage audio, peut être un fichier ou un dossier",
|
205 |
+
"音频超分中": "Upscaling audio en cours",
|
206 |
+
"预训练GPT模型路径": "Chemin du modèle GPT pré-entraîné",
|
207 |
+
"预训练SSL模型路径": "Chemin du modèle SSL pré-entraîné",
|
208 |
+
"预训练SoVITS-D模型路径": "Chemin du modèle SoVITS-D pré-entraîné",
|
209 |
+
"预训练SoVITS-G模型路径": "Chemin du modèle SoVITS-G pré-entraîné",
|
210 |
+
"预训练中文BERT模型路径": "Chemin du modèle BERT chinois pré-entraîné"
|
211 |
+
}
|