Kit-Lemonfoot
committed on
Commit
•
8bf7162
1
Parent(s):
230c0f6
Of course it didn't work. Now I get to go on another patching odyssey! Yay!
Browse files- GPT_SoVITS/inference_webui.py +271 -271
GPT_SoVITS/inference_webui.py
CHANGED
@@ -1,271 +1,271 @@
|
|
1 |
-
# Based on GPT-SoVITS-fast-inference by ChasonJiang
|
2 |
-
|
3 |
-
import random
|
4 |
-
import os
|
5 |
-
import torch
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
logging.getLogger("
|
15 |
-
logging.getLogger("
|
16 |
-
logging.getLogger("
|
17 |
-
logging.getLogger("
|
18 |
-
logging.getLogger("
|
19 |
-
logging.getLogger("
|
20 |
-
|
21 |
-
import
|
22 |
-
|
23 |
-
|
24 |
-
infer_ttswebui =
|
25 |
-
|
26 |
-
is_share =
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
#
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
import
|
39 |
-
from TTS_infer_pack.
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
"
|
46 |
-
"
|
47 |
-
"
|
48 |
-
"
|
49 |
-
"
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
"
|
55 |
-
"
|
56 |
-
"
|
57 |
-
"
|
58 |
-
"
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
tts_config
|
63 |
-
tts_config.
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
def inference(name, gptmp, svmp, sty, text, text_lang,
|
82 |
-
ref_audio_path, prompt_text,
|
83 |
-
prompt_lang, top_k,
|
84 |
-
top_p, temperature,
|
85 |
-
text_split_method, batch_size,
|
86 |
-
speed_factor,
|
87 |
-
split_bucket,fragment_interval,
|
88 |
-
seed, keep_random, parallel_infer,
|
89 |
-
repetition_penalty
|
90 |
-
):
|
91 |
-
|
92 |
-
global clm
|
93 |
-
#Live switching
|
94 |
-
if(not ref_audio_path):
|
95 |
-
ref_audio_path=f"referenceaudio/{name}/"+referencedata[name][0][sty]
|
96 |
-
prompt_text=referencedata[name][1][sty]
|
97 |
-
if clm!=name:
|
98 |
-
print(f"Switching to model {name}")
|
99 |
-
clm=name
|
100 |
-
tts_pipeline.init_t2s_weights(gptmp)
|
101 |
-
tts_pipeline.init_vits_weights(svmp)
|
102 |
-
|
103 |
-
seed = -1 if keep_random else seed
|
104 |
-
actual_seed = seed if seed not in [-1, "", None] else random.randrange(1 << 32)
|
105 |
-
print(f"TMP: {temperature} | SPDFCT: {speed_factor} | STY: {sty} | LANG: {text_lang}")
|
106 |
-
inputs={
|
107 |
-
"text": text,
|
108 |
-
"text_lang": dict_language[text_lang],
|
109 |
-
"ref_audio_path": ref_audio_path,
|
110 |
-
"prompt_text": prompt_text,
|
111 |
-
"prompt_lang": dict_language[prompt_lang],
|
112 |
-
"top_k": top_k,
|
113 |
-
"top_p": top_p,
|
114 |
-
"temperature": temperature,
|
115 |
-
"text_split_method": cut_method[text_split_method],
|
116 |
-
"batch_size":int(batch_size),
|
117 |
-
"speed_factor":float(speed_factor),
|
118 |
-
"split_bucket":split_bucket,
|
119 |
-
"return_fragment":False,
|
120 |
-
"fragment_interval":fragment_interval,
|
121 |
-
"seed":actual_seed,
|
122 |
-
"parallel_infer": parallel_infer,
|
123 |
-
"repetition_penalty": repetition_penalty,
|
124 |
-
}
|
125 |
-
for item in tts_pipeline.run(inputs):
|
126 |
-
yield item, actual_seed
|
127 |
-
|
128 |
-
def custom_sort_key(s):
|
129 |
-
# δ½Ώη¨ζ£ε葨达εΌζεε符串δΈηζ°ει¨εειζ°ει¨ε
|
130 |
-
parts = re.split('(\d+)', s)
|
131 |
-
# ε°ζ°ει¨ε转ζ’δΈΊζ΄ζ°οΌιζ°ει¨εδΏζδΈε
|
132 |
-
parts = [int(part) if part.isdigit() else part for part in parts]
|
133 |
-
return parts
|
134 |
-
|
135 |
-
|
136 |
-
def change_choices():
|
137 |
-
SoVITS_names, GPT_names = get_weights_names()
|
138 |
-
return {"choices": sorted(SoVITS_names, key=custom_sort_key), "__type__": "update"}, {"choices": sorted(GPT_names, key=custom_sort_key), "__type__": "update"}
|
139 |
-
|
140 |
-
|
141 |
-
pretrained_sovits_name = "GPT_SoVITS/pretrained_models/s2G488k.pth"
|
142 |
-
pretrained_gpt_name = "GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt"
|
143 |
-
SoVITS_weight_root = "GPT_SoVITS/SoVITS_weights/"
|
144 |
-
GPT_weight_root = "GPT_SoVITS/GPT_weights/"
|
145 |
-
|
146 |
-
def get_weights_names():
|
147 |
-
SoVITS_names = [pretrained_sovits_name]
|
148 |
-
for name in os.listdir(SoVITS_weight_root):
|
149 |
-
if name.endswith(".pth"): SoVITS_names.append("%s/%s" % (SoVITS_weight_root, name))
|
150 |
-
GPT_names = [pretrained_gpt_name]
|
151 |
-
for name in os.listdir(GPT_weight_root):
|
152 |
-
if name.endswith(".ckpt"): GPT_names.append("%s/%s" % (GPT_weight_root, name))
|
153 |
-
return SoVITS_names, GPT_names
|
154 |
-
|
155 |
-
def load_models():
|
156 |
-
print("Loading models...")
|
157 |
-
voices=[]
|
158 |
-
ustyles={}
|
159 |
-
with open("voicelist.json", "r", encoding="utf-8") as f:
|
160 |
-
voc_info = json.load(f)
|
161 |
-
for name, info in voc_info.items():
|
162 |
-
if not info['enable']:
|
163 |
-
continue
|
164 |
-
title= info['title']
|
165 |
-
gptmodelpath= "%s/%s" % (GPT_weight_root, info['gpt_model_path'])
|
166 |
-
sovitsmodelpath= "%s/%s" % (SoVITS_weight_root, info['sovits_model_path'])
|
167 |
-
author= info['modelauthor']
|
168 |
-
image = info['cover']
|
169 |
-
styles = info['styles']
|
170 |
-
#check that all styles properly exist
|
171 |
-
for s in styles.values():
|
172 |
-
if(not os.path.exists(f"referenceaudio/{name}/{s}")):
|
173 |
-
print(f"WARNING : Some defined preset styles do not exist for model {name}, skipping")
|
174 |
-
styles=None
|
175 |
-
break
|
176 |
-
styletrans = info['styletrans']
|
177 |
-
st=[styles, styletrans]
|
178 |
-
voices.append((name, title, gptmodelpath, sovitsmodelpath, author, image))
|
179 |
-
ustyles[name]=st
|
180 |
-
print(f"Indexed model {title}")
|
181 |
-
return voices, ustyles
|
182 |
-
|
183 |
-
modeldata, referencedata = load_models()
|
184 |
-
|
185 |
-
#Gradio preload
|
186 |
-
text = gr.TextArea(label="Input Text", value="Hello there! This is test audio of a new text to speech tool.")
|
187 |
-
text_language = gr.Dropdown(label="Language", choices=["EN", "JP", "ZH", "ZH/EN", "JP/EN", "Automatic"], value="EN")
|
188 |
-
how_to_cut = gr.Dropdown(label="Slicing Method",
|
189 |
-
choices=["None", "4 Sentences", "50 Characters", "ZH/JP Punctuation", "EN Punctuation", "All Punctuation" ],
|
190 |
-
value="4 Sentences",
|
191 |
-
interactive=True,
|
192 |
-
)
|
193 |
-
top_k = gr.Slider(minimum=1,maximum=100,step=1,label="Top_k",value=5,interactive=True)
|
194 |
-
top_p = gr.Slider(minimum=0,maximum=1,step=0.05,label="Top_p",value=1,interactive=True)
|
195 |
-
temperature = gr.Slider(minimum=0,maximum=1,step=0.05,label="Temperature",value=0.7,interactive=True)
|
196 |
-
batch_size = gr.Slider(minimum=1,maximum=200,step=1,label="Batch Size",value=20,interactive=True)
|
197 |
-
fragment_interval = gr.Slider(minimum=0.01,maximum=1,step=0.01,label="Fragment Interval",value=0.3,interactive=True)
|
198 |
-
speed_factor = gr.Slider(minimum=0.50,maximum=2,step=0.05,label="Speed Factor",value=1.0,interactive=True)
|
199 |
-
repetition_penalty = gr.Slider(minimum=0,maximum=2,step=0.05,label="Repetition Penalty",value=1.35,interactive=True)
|
200 |
-
parallel_infer = gr.Checkbox(label="Parallel Infer", value=True, interactive=True, show_label=True)
|
201 |
-
split_bucket = gr.Checkbox(label="Split Bucket", value=True, interactive=True, show_label=True)
|
202 |
-
seed = gr.Number(label="Random Seed",value=-1, interactive=True, show_label=True)
|
203 |
-
keep_random = gr.Checkbox(label="Use Randomized Seed", value=True, interactive=True, show_label=True)
|
204 |
-
|
205 |
-
#Main gradio
|
206 |
-
with gr.Blocks(title="Lemonfoot GPT-SoVITS") as app:
|
207 |
-
gr.Markdown(
|
208 |
-
"# Lemonfoot GPT-SoVITS ππ\n"
|
209 |
-
"### Space by Kit Lemonfoot / Noel Shirogane's High Flying Birds\n"
|
210 |
-
"Based on code originally by RVC_Boss and ChasonJiang\n\n"
|
211 |
-
"Do no evil.\n\n"
|
212 |
-
)
|
213 |
-
for (name, title, gptmodelpath, sovitsmodelpath, author, image) in modeldata:
|
214 |
-
with gr.TabItem(name):
|
215 |
-
with gr.Row():
|
216 |
-
with gr.Column():
|
217 |
-
n = gr.Textbox(value=name, visible=False, interactive=False)
|
218 |
-
gptmp = gr.Textbox(value=gptmodelpath, visible=False, interactive=False)
|
219 |
-
svmp = gr.Textbox(value=sovitsmodelpath, visible=False, interactive=False)
|
220 |
-
gr.Markdown(f"**{title}**\n\n Dataset author: {author}")
|
221 |
-
gr.Image(f"images/{image}", label=None, show_label=False, width=300, show_download_button=False, container=False, show_share_button=False)
|
222 |
-
with gr.Column():
|
223 |
-
#if there isn't any styles don't bother rendering the style window
|
224 |
-
if(not referencedata[name][0]==None):
|
225 |
-
rd = list(referencedata[name][0].keys())
|
226 |
-
with gr.TabItem("Style using a preset"):
|
227 |
-
sty = gr.Dropdown(
|
228 |
-
label="Current style",
|
229 |
-
choices=rd,
|
230 |
-
value=rd[0],
|
231 |
-
interactive=True
|
232 |
-
)
|
233 |
-
else:
|
234 |
-
sty=gr.Textbox(value="none", visible=False, interactive=False)
|
235 |
-
with gr.TabItem("Style using a different audio"):
|
236 |
-
with gr.Column():
|
237 |
-
ref_audio_path = gr.Audio(label="Reference Audio", type="filepath")
|
238 |
-
prompt_text = gr.Textbox(label="Reference Audio Text", interactive=True, placeholder="Leave blank to use no-text reference mode.")
|
239 |
-
prompt_language = gr.Dropdown(label="Reference Audio Language", choices=["EN", "JP", "ZH", "ZH/EN", "JP/EN", "Automatic"], value="EN")
|
240 |
-
with gr.Column():
|
241 |
-
inference_button = gr.Button("Synthesize", variant="primary")
|
242 |
-
output = gr.Audio(label="Output")
|
243 |
-
|
244 |
-
inference_button.click(
|
245 |
-
inference,
|
246 |
-
inputs=[n, gptmp, svmp, sty, text, text_language, ref_audio_path, prompt_text, prompt_language, top_k, top_p, temperature, how_to_cut, batch_size, speed_factor, split_bucket, fragment_interval, seed, keep_random, parallel_infer, repetition_penalty],
|
247 |
-
outputs=[output, seed]
|
248 |
-
)
|
249 |
-
|
250 |
-
#bottom info
|
251 |
-
with gr.Row():
|
252 |
-
with gr.Column():
|
253 |
-
text.render()
|
254 |
-
text_language.render()
|
255 |
-
how_to_cut.render()
|
256 |
-
with gr.Column():
|
257 |
-
temperature.render()
|
258 |
-
speed_factor.render()
|
259 |
-
with gr.Accordion("Advanced Inference Parameters", open=False):
|
260 |
-
top_k.render()
|
261 |
-
top_p.render()
|
262 |
-
batch_size.render()
|
263 |
-
fragment_interval.render()
|
264 |
-
repetition_penalty.render()
|
265 |
-
parallel_infer.render()
|
266 |
-
split_bucket.render()
|
267 |
-
seed.render()
|
268 |
-
keep_random.render()
|
269 |
-
|
270 |
-
|
271 |
-
app.queue().launch()
|
|
|
1 |
+
# Based on GPT-SoVITS-fast-inference by ChasonJiang
|
2 |
+
|
3 |
+
import random
|
4 |
+
import os
|
5 |
+
import torch
|
6 |
+
|
7 |
+
# Module setup: pick the device, quiet noisy loggers, read settings from the
# environment and build the shared TTS pipeline used by inference().

# Run on the GPU when CUDA is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

import re, logging
# Only show errors from these third-party loggers.
logging.getLogger("markdown_it").setLevel(logging.ERROR)
logging.getLogger("urllib3").setLevel(logging.ERROR)
logging.getLogger("httpcore").setLevel(logging.ERROR)
logging.getLogger("httpx").setLevel(logging.ERROR)
logging.getLogger("asyncio").setLevel(logging.ERROR)
logging.getLogger("charset_normalizer").setLevel(logging.ERROR)
logging.getLogger("torchaudio._extension").setLevel(logging.ERROR)
import pdb
import json

# Port number read from the environment (default 9872) and converted to int.
infer_ttswebui = os.environ.get("infer_ttswebui", 9872)
infer_ttswebui = int(infer_ttswebui)
is_share = os.environ.get("is_share", "False")
# NOTE(review): eval() executes whatever text the environment variable holds;
# this is only safe while the process environment is trusted.
is_share = eval(is_share)
# When _CUDA_VISIBLE_DEVICES is set, copy it over CUDA_VISIBLE_DEVICES.
if "_CUDA_VISIBLE_DEVICES" in os.environ:
    os.environ["CUDA_VISIBLE_DEVICES"] = os.environ["_CUDA_VISIBLE_DEVICES"]
# Half precision needs both the is_half setting (default "True") and CUDA.
# NOTE(review): same eval() caveat as is_share above.
is_half = eval(os.environ.get("is_half", "True")) and torch.cuda.is_available()
# Weight paths are not taken from the environment here; with both left as None
# the paths from the YAML config below are kept.
gpt_path=None
sovits_path=None
#gpt_path = os.environ.get("gpt_path", None)
#sovits_path = os.environ.get("sovits_path", None)
cnhubert_base_path = os.environ.get("cnhubert_base_path", None)
bert_path = os.environ.get("bert_path", None)

import gradio as gr
from TTS_infer_pack.TTS import TTS, TTS_Config
from TTS_infer_pack.text_segmentation_method import get_method

import nltk
nltk.download('averaged_perceptron_tagger_eng')

# UI language label -> language code handed to the TTS pipeline.
dict_language = {
    "ZH": "all_zh",  # treat all text as Chinese
    "EN": "en",  # treat all text as English (unchanged)
    "JP": "all_ja",  # treat all text as Japanese
    "ZH/EN": "zh",  # mixed Chinese/English (unchanged)
    "JP/EN": "ja",  # mixed Japanese/English (unchanged)
    "Automatic": "auto",  # multilingual: split the text and detect each language
}

# UI slicing-method label -> text_split_method value handed to the TTS pipeline.
cut_method = {
    "None":"cut0",
    "4 Sentences": "cut1",
    "50 Characters": "cut2",
    "ZH/JP Punctuation": "cut3",
    "EN Punctuation": "cut4",
    "All Punctuation": "cut5",
}

# Start from the YAML config, then override it with the values resolved above.
tts_config = TTS_Config("GPT_SoVITS/configs/tts_infer.yaml")
tts_config.device = device
tts_config.is_half = is_half
if gpt_path is not None:
    tts_config.t2s_weights_path = gpt_path
if sovits_path is not None:
    tts_config.vits_weights_path = sovits_path
if cnhubert_base_path is not None:
    # NOTE(review): the attribute is spelled "cnhuhbert" here; confirm it
    # matches the attribute name TTS_Config actually reads.
    tts_config.cnhuhbert_base_path = cnhubert_base_path
if bert_path is not None:
    tts_config.bert_base_path = bert_path

print(tts_config)
tts_pipeline = TTS(tts_config)
# Remember which weight paths the config ended up with.
gpt_path = tts_config.t2s_weights_path
sovits_path = tts_config.vits_weights_path

# Name of the voice model whose weights inference() loaded last ("" = none yet).
clm= ""
|
80 |
+
|
81 |
+
def inference(name, gptmp, svmp, sty, text, text_lang,
              ref_audio_path, prompt_text,
              prompt_lang, top_k,
              top_p, temperature,
              text_split_method, batch_size,
              speed_factor,
              split_bucket,fragment_interval,
              seed, keep_random, parallel_infer,
              repetition_penalty
              ):
    """Synthesize *text* with the voice model *name* and yield the results.

    Args:
        name: Voice model key; used to index ``referencedata`` and to decide
            whether the pipeline weights must be switched.
        gptmp: Path passed to ``tts_pipeline.init_t2s_weights`` on a switch.
        svmp: Path passed to ``tts_pipeline.init_vits_weights`` on a switch.
        sty: Preset style key, used only when no reference audio is supplied.
        text: Text to synthesize.
        text_lang: Key of ``dict_language`` for *text*.
        ref_audio_path: Reference audio file; when empty the preset *sty* is
            used and *prompt_text* is replaced by the preset's text.
        prompt_text: Transcript of the reference audio.
        prompt_lang: Key of ``dict_language`` for *prompt_text*.
        text_split_method: Key of ``cut_method``.
        batch_size: Converted to ``int`` before being passed on.
        speed_factor: Converted to ``float`` before being passed on.
        seed: Requested seed; ``-1``, ``""`` or ``None`` mean "pick one".
        keep_random: When true, *seed* is ignored and a random seed is drawn.
        top_k, top_p, temperature, split_bucket, fragment_interval,
        parallel_infer, repetition_penalty: Passed to the pipeline unchanged.

    Yields:
        ``(item, actual_seed)`` for every item produced by
        ``tts_pipeline.run``; ``actual_seed`` is the integer seed that was used.

    Raises:
        ValueError: No reference audio was supplied and the model has no
            preset called *sty*.
    """
    global clm

    # Fall back to the selected preset when the user uploaded no reference audio.
    if not ref_audio_path:
        presets = referencedata[name][0]
        # presets is None when the model has no usable preset styles; indexing
        # it would otherwise fail with an unhelpful TypeError.
        if presets is None or sty not in presets:
            raise ValueError(
                f"No preset style {sty!r} is available for model {name!r}; "
                "upload a reference audio clip instead."
            )
        ref_audio_path = f"referenceaudio/{name}/" + presets[sty]
        prompt_text = referencedata[name][1][sty]

    # Live switching: reload the weights when a different model is requested.
    if clm != name:
        print(f"Switching to model {name}")
        # Forget the previous model first: if either load below fails, the
        # pipeline may hold mixed weights and the next call must reload.
        clm = None
        tts_pipeline.init_t2s_weights(gptmp)
        tts_pipeline.init_vits_weights(svmp)
        clm = name

    if keep_random or seed in (-1, "", None):
        actual_seed = random.randrange(1 << 32)
    else:
        # The UI number field may deliver the seed as a float.
        actual_seed = int(seed)

    print(f"TMP: {temperature} | SPDFCT: {speed_factor} | STY: {sty} | LANG: {text_lang}")
    inputs = {
        "text": text,
        "text_lang": dict_language[text_lang],
        "ref_audio_path": ref_audio_path,
        "prompt_text": prompt_text,
        "prompt_lang": dict_language[prompt_lang],
        "top_k": top_k,
        "top_p": top_p,
        "temperature": temperature,
        "text_split_method": cut_method[text_split_method],
        "batch_size": int(batch_size),
        "speed_factor": float(speed_factor),
        "split_bucket": split_bucket,
        "return_fragment": False,
        "fragment_interval": fragment_interval,
        "seed": actual_seed,
        "parallel_infer": parallel_infer,
        "repetition_penalty": repetition_penalty,
    }
    for item in tts_pipeline.run(inputs):
        yield item, actual_seed
|
127 |
+
|
128 |
+
def custom_sort_key(s):
    """Return a natural-sort key for the string *s*.

    The string is split into alternating text and digit runs, and the digit
    runs are turned into integers, so ``"w2"`` sorts before ``"w10"``.

    Args:
        s: The string to build a key for.

    Returns:
        A list whose even positions hold the text pieces (possibly empty
        strings) and whose odd positions hold the numbers as ``int``.
    """
    # The capturing group makes re.split keep the digit runs: even indices are
    # the text between numbers, odd indices are the digit runs themselves.
    parts = re.split(r"(\d+)", s)
    # Convert by position instead of str.isdigit(): isdigit() is also true for
    # characters such as "²", which int() cannot parse.
    return [int(part) if index % 2 else part for index, part in enumerate(parts)]
|
134 |
+
|
135 |
+
|
136 |
+
def change_choices():
    """Return update payloads for the SoVITS and GPT weight pickers.

    Returns:
        A pair of dicts, SoVITS first and GPT second, each carrying the
        naturally sorted weight paths under ``"choices"`` and the marker
        ``"__type__": "update"``.
    """
    sovits_paths, gpt_paths = get_weights_names()
    sovits_update = {
        "choices": sorted(sovits_paths, key=custom_sort_key),
        "__type__": "update",
    }
    gpt_update = {
        "choices": sorted(gpt_paths, key=custom_sort_key),
        "__type__": "update",
    }
    return sovits_update, gpt_update
|
139 |
+
|
140 |
+
|
141 |
+
# Pretrained base weights; always listed first by get_weights_names().
pretrained_sovits_name = "GPT_SoVITS/pretrained_models/s2G488k.pth"
pretrained_gpt_name = "GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt"
# Directories that hold the per-voice weights.
SoVITS_weight_root = "GPT_SoVITS/SoVITS_weights/"
GPT_weight_root = "GPT_SoVITS/GPT_weights/"


def _list_weights(root, suffix):
    """Return the paths of the entries in *root* whose name ends with *suffix*.

    A missing directory yields an empty list instead of raising.
    """
    if not os.path.isdir(root):
        return []
    # os.path.join avoids the "//" that "%s/%s" produced for roots ending in "/".
    return [os.path.join(root, entry) for entry in os.listdir(root) if entry.endswith(suffix)]


def get_weights_names(sovits_root=None, gpt_root=None):
    """List the available SoVITS and GPT weight files.

    Args:
        sovits_root: Directory scanned for ``.pth`` files; defaults to
            ``SoVITS_weight_root``.
        gpt_root: Directory scanned for ``.ckpt`` files; defaults to
            ``GPT_weight_root``.

    Returns:
        ``(SoVITS_names, GPT_names)``: two lists of paths, each starting with
        the corresponding pretrained weight file.
    """
    if sovits_root is None:
        sovits_root = SoVITS_weight_root
    if gpt_root is None:
        gpt_root = GPT_weight_root
    SoVITS_names = [pretrained_sovits_name] + _list_weights(sovits_root, ".pth")
    GPT_names = [pretrained_gpt_name] + _list_weights(gpt_root, ".ckpt")
    return SoVITS_names, GPT_names
|
154 |
+
|
155 |
+
def load_models(voicelist_path="voicelist.json", gpt_root=None, sovits_root=None):
    """Read the voice list and index every enabled voice model.

    Args:
        voicelist_path: JSON file mapping a model name to its settings
            (``enable``, ``title``, ``gpt_model_path``, ``sovits_model_path``,
            ``modelauthor``, ``cover``, ``styles``, ``styletrans``).
        gpt_root: Directory the ``gpt_model_path`` entries are relative to;
            defaults to ``GPT_weight_root``.
        sovits_root: Directory the ``sovits_model_path`` entries are relative
            to; defaults to ``SoVITS_weight_root``.

    Returns:
        ``(voices, ustyles)``. ``voices`` is a list of
        ``(name, title, gptmodelpath, sovitsmodelpath, author, image)`` tuples.
        ``ustyles`` maps each model name to ``[styles, styletrans]``, where
        ``styles`` is ``None`` when the model has no usable preset styles.
    """
    if gpt_root is None:
        gpt_root = GPT_weight_root
    if sovits_root is None:
        sovits_root = SoVITS_weight_root

    print("Loading models...")
    voices = []
    ustyles = {}
    with open(voicelist_path, "r", encoding="utf-8") as f:
        voc_info = json.load(f)

    for name, info in voc_info.items():
        if not info['enable']:
            continue
        title = info['title']
        # os.path.join avoids the "//" that "%s/%s" produced for roots ending in "/".
        gptmodelpath = os.path.join(gpt_root, info['gpt_model_path'])
        sovitsmodelpath = os.path.join(sovits_root, info['sovits_model_path'])
        author = info['modelauthor']
        image = info['cover']
        # A missing, null or empty style table all mean "no presets"; an empty
        # table would otherwise break the style dropdown, which needs one entry.
        styles = info.get('styles') or None
        # Drop the presets entirely if any referenced audio file is missing.
        if styles is not None and any(
            not os.path.exists(f"referenceaudio/{name}/{s}") for s in styles.values()
        ):
            print(f"WARNING : Some defined preset styles do not exist for model {name}, skipping")
            styles = None
        styletrans = info['styletrans']
        voices.append((name, title, gptmodelpath, sovitsmodelpath, author, image))
        ustyles[name] = [styles, styletrans]
        print(f"Indexed model {title}")
    return voices, ustyles
|
182 |
+
|
183 |
+
# Index the enabled voices once at import time: modeldata is the list of voice
# tuples, referencedata maps a voice name to its [styles, styletrans] pair.
modeldata, referencedata = load_models()

#Gradio preload
# These shared controls are created here, outside the Blocks layout, and are
# placed on the page later through their .render() calls.
text = gr.TextArea(label="Input Text", value="Hello there! This is test audio of a new text to speech tool.")
# The choices are the keys of dict_language.
text_language = gr.Dropdown(label="Language", choices=["EN", "JP", "ZH", "ZH/EN", "JP/EN", "Automatic"], value="EN")
# The choices are the keys of cut_method.
how_to_cut = gr.Dropdown(label="Slicing Method",
    choices=["None", "4 Sentences", "50 Characters", "ZH/JP Punctuation", "EN Punctuation", "All Punctuation" ],
    value="4 Sentences",
    interactive=True,
)
top_k = gr.Slider(minimum=1,maximum=100,step=1,label="Top_k",value=5,interactive=True)
top_p = gr.Slider(minimum=0,maximum=1,step=0.05,label="Top_p",value=1,interactive=True)
temperature = gr.Slider(minimum=0,maximum=1,step=0.05,label="Temperature",value=0.7,interactive=True)
batch_size = gr.Slider(minimum=1,maximum=200,step=1,label="Batch Size",value=20,interactive=True)
fragment_interval = gr.Slider(minimum=0.01,maximum=1,step=0.01,label="Fragment Interval",value=0.3,interactive=True)
speed_factor = gr.Slider(minimum=0.50,maximum=2,step=0.05,label="Speed Factor",value=1.0,interactive=True)
repetition_penalty = gr.Slider(minimum=0,maximum=2,step=0.05,label="Repetition Penalty",value=1.35,interactive=True)
parallel_infer = gr.Checkbox(label="Parallel Infer", value=True, interactive=True, show_label=True)
split_bucket = gr.Checkbox(label="Split Bucket", value=True, interactive=True, show_label=True)
# -1 asks inference() to draw a random seed; the seed actually used is written
# back into this field because it is also an output of the click handler.
seed = gr.Number(label="Random Seed",value=-1, interactive=True, show_label=True)
# When checked, inference() ignores the seed field and draws a random seed.
keep_random = gr.Checkbox(label="Use Randomized Seed", value=True, interactive=True, show_label=True)
|
204 |
+
|
205 |
+
#Main gradio
|
206 |
+
#Main gradio
# Builds the page: a header, one tab per indexed voice model, then the shared
# text and parameter controls, and finally starts the app.
# NOTE(review): the nesting below was reconstructed from a listing that had
# lost its indentation; confirm the layout against the repository copy.
with gr.Blocks(title="Lemonfoot GPT-SoVITS") as app:
    gr.Markdown(
        "# Lemonfoot GPT-SoVITS ππ\n"
        "### Space by Kit Lemonfoot / Noel Shirogane's High Flying Birds\n"
        "Based on code originally by RVC_Boss and ChasonJiang\n\n"
        "Do no evil.\n\n"
    )
    for (name, title, gptmodelpath, sovitsmodelpath, author, image) in modeldata:
        with gr.TabItem(name):
            with gr.Row():
                with gr.Column():
                    # Hidden fields carry the model name and weight paths into
                    # inference() as ordinary inputs.
                    n = gr.Textbox(value=name, visible=False, interactive=False)
                    gptmp = gr.Textbox(value=gptmodelpath, visible=False, interactive=False)
                    svmp = gr.Textbox(value=sovitsmodelpath, visible=False, interactive=False)
                    gr.Markdown(f"**{title}**\n\n Dataset author: {author}")
                    gr.Image(f"images/{image}", label=None, show_label=False, width=300, show_download_button=False, container=False, show_share_button=False)
                with gr.Column():
                    # Only render the preset picker when the model has presets;
                    # otherwise a hidden placeholder keeps the input list the
                    # same length.
                    if referencedata[name][0] is not None:
                        rd = list(referencedata[name][0].keys())
                        with gr.TabItem("Style using a preset"):
                            sty = gr.Dropdown(
                                label="Current style",
                                choices=rd,
                                value=rd[0],
                                interactive=True
                            )
                    else:
                        sty = gr.Textbox(value="none", visible=False, interactive=False)
                    with gr.TabItem("Style using a different audio"):
                        with gr.Column():
                            ref_audio_path = gr.Audio(label="Reference Audio", type="filepath")
                            prompt_text = gr.Textbox(label="Reference Audio Text", interactive=True, placeholder="Leave blank to use no-text reference mode.")
                            prompt_language = gr.Dropdown(label="Reference Audio Language", choices=["EN", "JP", "ZH", "ZH/EN", "JP/EN", "Automatic"], value="EN")
                with gr.Column():
                    inference_button = gr.Button("Synthesize", variant="primary")
                    output = gr.Audio(label="Output")

                # The input order must match the parameter order of inference().
                inference_button.click(
                    inference,
                    inputs=[n, gptmp, svmp, sty, text, text_language, ref_audio_path, prompt_text, prompt_language, top_k, top_p, temperature, how_to_cut, batch_size, speed_factor, split_bucket, fragment_interval, seed, keep_random, parallel_infer, repetition_penalty],
                    outputs=[output, seed]
                )

    #bottom info
    # Place the shared controls that were created before the layout.
    with gr.Row():
        with gr.Column():
            text.render()
            text_language.render()
            how_to_cut.render()
        with gr.Column():
            temperature.render()
            speed_factor.render()
            with gr.Accordion("Advanced Inference Parameters", open=False):
                top_k.render()
                top_p.render()
                batch_size.render()
                fragment_interval.render()
                repetition_penalty.render()
                parallel_infer.render()
                split_bucket.render()
                seed.render()
                keep_random.render()


app.queue().launch()
|