Update inference_webui.py
inference_webui.py CHANGED (+42 -57)
@@ -84,22 +84,15 @@ from module.mel_processing import spectrogram_torch
 from module.models import SynthesizerTrn
 from text import cleaned_text_to_sequence
 from text.cleaner import clean_text
-
+from tools.i18n.i18n import I18nAuto, scan_language_list
 from tools.my_utils import load_audio

-
-
-
+language=os.environ.get("language","Auto")
+language=sys.argv[-1] if sys.argv[-1] in scan_language_list() else language
+i18n = I18nAuto(language="Auto")

 # os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1' # 确保直接启动推理UI时也能够设置。

-i18n_dict={}
-json_root="tools/i18n/locale"
-for name in os.listdir(json_root):
-    with open("%s/%s"%(json_root,name),"r")as f:
-        data=json.loads(f.read())
-    i18n_dict[name.split(".json")[0].replace("_","-")]=data
-i18n=gr.I18n(**i18n_dict)

 if torch.cuda.is_available():
     device = "cuda"
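Note: the three added lines resolve the UI language in this order: the `language` environment variable (default "Auto"), then a trailing command-line argument if it names an available locale, and the result feeds `I18nAuto`. A minimal sketch of that resolution, with `scan_language_list` replaced by a hypothetical stand-in that merely lists the JSON files under tools/i18n/locale (the same directory the removed gr.I18n block read from); the real helper lives in tools.i18n.i18n:

    import os
    import sys

    def scan_language_list(locale_root: str = "tools/i18n/locale") -> list[str]:
        # Hypothetical stand-in for tools.i18n.i18n.scan_language_list:
        # list the locale JSON files shipped with the repo.
        if not os.path.isdir(locale_root):
            return []
        return [name[: -len(".json")] for name in os.listdir(locale_root) if name.endswith(".json")]

    # Same resolution order as the added lines: env var first, then a trailing CLI argument.
    language = os.environ.get("language", "Auto")
    language = sys.argv[-1] if sys.argv[-1] in scan_language_list() else language
    print(language)  # e.g. "zh_CN" when launched as `python inference_webui.py zh_CN`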
@@ -108,34 +101,26 @@ else:
     device = "cpu"
     is_half = False

-# i18n_dict={}
-# json_root="tools/i18n/locale"
-# for name in os.listdir(json_root):
-#     with open("%s/%s"%(json_root,name),"r")as f:
-#         data=json.loads(f.read())
-#     i18n_dict[name.split(".json")[0].replace("_","-")]=data
-# i18n=gr.I18n(**i18n_dict)
-
 dict_language_v1 = {
-    i18n("中文")
-    i18n("英文")
-    i18n("日文")
-    i18n("中英混合")
-    i18n("日英混合")
-    i18n("多语种混合")
+    i18n("中文"): "all_zh",  # 全部按中文识别
+    i18n("英文"): "en",  # 全部按英文识别#######不变
+    i18n("日文"): "all_ja",  # 全部按日文识别
+    i18n("中英混合"): "zh",  # 按中英混合识别####不变
+    i18n("日英混合"): "ja",  # 按日英混合识别####不变
+    i18n("多语种混合"): "auto",  # 多语种启动切分识别语种
 }
 dict_language_v2 = {
-    i18n("中文")
-    i18n("英文")
-    i18n("日文")
-    i18n("粤语")
-    i18n("韩文")
-    i18n("中英混合")
-    i18n("日英混合")
-    i18n("粤英混合")
-    i18n("韩英混合")
-    i18n("多语种混合")
-    i18n("多语种混合(粤语)")
+    i18n("中文"): "all_zh",  # 全部按中文识别
+    i18n("英文"): "en",  # 全部按英文识别#######不变
+    i18n("日文"): "all_ja",  # 全部按日文识别
+    i18n("粤语"): "all_yue",  # 全部按中文识别
+    i18n("韩文"): "all_ko",  # 全部按韩文识别
+    i18n("中英混合"): "zh",  # 按中英混合识别####不变
+    i18n("日英混合"): "ja",  # 按日英混合识别####不变
+    i18n("粤英混合"): "yue",  # 按粤英混合识别####不变
+    i18n("韩英混合"): "ko",  # 按韩英混合识别####不变
+    i18n("多语种混合"): "auto",  # 多语种启动切分识别语种
+    i18n("多语种混合(粤语)"): "auto_yue",  # 多语种启动切分识别语种
 }
 dict_language = dict_language_v1 if version == "v1" else dict_language_v2

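Note: the mapping keys are the translated dropdown labels and the values are the internal language codes used by the synthesis pipeline. A minimal self-contained sketch of the lookup, with `i18n` stubbed as an identity fallback (assuming, as I18nAuto-style lookups do, that an untranslated key is returned unchanged):

    def i18n(key: str) -> str:
        # Identity stub standing in for tools.i18n.i18n.I18nAuto: the real class
        # looks the key up in the selected locale's JSON and falls back to the key.
        return key

    dict_language_v2 = {
        i18n("中文"): "all_zh",
        i18n("英文"): "en",
        i18n("粤语"): "all_yue",
        i18n("多语种混合"): "auto",
    }

    selected = i18n("中文")             # value coming back from the Gradio dropdown
    print(dict_language_v2[selected])   # -> "all_zh", the internal code used downstream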
@@ -226,7 +211,7 @@ def change_sovits_weights(sovits_path, prompt_language=None, text_language=None)
     print(vq_model.load_state_dict(dict_s2["weight"], strict=False))
     dict_language = dict_language_v1 if version == "v1" else dict_language_v2
     if prompt_language is not None and text_language is not None:
-        if prompt_language in list(
+        if prompt_language in list(dict_language.keys()):
             prompt_text_update, prompt_language_update = (
                 {"__type__": "update"},
                 {"__type__": "update", "value": prompt_language},
@@ -234,14 +219,14 @@ def change_sovits_weights(sovits_path, prompt_language=None, text_language=None)
         else:
             prompt_text_update = {"__type__": "update", "value": ""}
             prompt_language_update = {"__type__": "update", "value": i18n("中文")}
-        if text_language in list(
+        if text_language in list(dict_language.keys()):
             text_update, text_language_update = {"__type__": "update"}, {"__type__": "update", "value": text_language}
         else:
             text_update = {"__type__": "update", "value": ""}
             text_language_update = {"__type__": "update", "value": i18n("中文")}
     return (
-        {"__type__": "update", "choices": list(
-        {"__type__": "update", "choices": list(
+        {"__type__": "update", "choices": list(dict_language.keys())},
+        {"__type__": "update", "choices": list(dict_language.keys())},
         prompt_text_update,
         prompt_language_update,
         text_update,
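Note: the `{"__type__": "update"}` dicts returned here are the dict form of a Gradio component update (roughly what `gr.update(...)` produces), which lets `change_sovits_weights` refresh the language dropdowns' `choices` and reset the text boxes without rebuilding the UI. A minimal sketch of the pattern with a hypothetical callback and hard-coded choices, not the webui's actual wiring:

    import gradio as gr

    def refresh_language_dropdown(version: str):
        # Dict-style update: same "__type__": "update" convention the webui returns.
        choices = ["中文", "英文", "日文"] if version == "v1" else ["中文", "英文", "日文", "粤语", "韩文"]
        return {"__type__": "update", "choices": choices, "value": "中文"}

    with gr.Blocks() as demo:
        version_box = gr.Radio(choices=["v1", "v2"], value="v2", label="version")
        lang_box = gr.Dropdown(choices=["中文"], value="中文", label="language")
        version_box.change(refresh_language_dropdown, inputs=version_box, outputs=lang_box)
    # demo.launch()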
@@ -483,12 +468,12 @@ def get_tts_wav(
         prompt_text = prompt_text.strip("\n")
         if prompt_text[-1] not in splits:
             prompt_text += "。" if prompt_language != "en" else "."
-        print(i18n("实际输入的参考文本:")
+        print(i18n("实际输入的参考文本:"), prompt_text)
     text = text.strip("\n")
     if text[0] not in splits and len(get_first(text)) < 4:
         text = "。" + text if text_language != "en" else "." + text

-    print(i18n("实际输入的目标文本:")
+    print(i18n("实际输入的目标文本:"), text)
     zero_wav = np.zeros(
         int(hps.data.sampling_rate * 0.3),
         dtype=np.float16 if is_half == True else np.float32,
@@ -532,7 +517,7 @@ def get_tts_wav(
         text = cut5(text)
     while "\n\n" in text:
         text = text.replace("\n\n", "\n")
-    print(i18n("实际输入的目标文本(切句后):")
+    print(i18n("实际输入的目标文本(切句后):"), text)
     texts = text.split("\n")
     texts = process_text(texts)
     texts = merge_short_text_in_array(texts, 5)
@@ -548,9 +533,9 @@ def get_tts_wav(
             continue
         if text[-1] not in splits:
             text += "。" if text_language != "en" else "."
-        print(i18n("实际输入的目标文本(每句):")
+        print(i18n("实际输入的目标文本(每句):"), text)
         phones2, bert2, norm_text2 = get_phones_and_bert(text, text_language, version)
-        print(i18n("前端处理后的文本(每句):")
+        print(i18n("前端处理后的文本(每句):"), norm_text2)
         if not ref_free:
             bert = torch.cat([bert1, bert2], 1)
             all_phoneme_ids = torch.LongTensor(phones1 + phones2).to(device).unsqueeze(0)
@@ -563,7 +548,7 @@ def get_tts_wav(

         t2 = ttime()
         # cache_key="%s-%s-%s-%s-%s-%s-%s-%s"%(ref_wav_path,prompt_text,prompt_language,text,text_language,top_k,top_p,temperature)
-        # print(
+        # print(cache.keys(),if_freeze)
         if i_text in cache and if_freeze == True:
             pred_semantic = cache[i_text]
         else:
@@ -813,7 +798,7 @@ with gr.Blocks(
                     ),
                 )
                 prompt_language = gr.Dropdown(
-                    label=i18n("参考音频的语种"), choices=list(
+                    label=i18n("参考音频的语种"), choices=list(dict_language.keys()), value=i18n("中文")
                 )
                 inp_refs = gr.File(
                     label=i18n(
@@ -828,18 +813,18 @@ with gr.Blocks(
             with gr.Column():
                 text_language = gr.Dropdown(
                     label=i18n("需要合成的语种。限制范围越小判别效果越好。"),
-                    choices=list(
+                    choices=list(dict_language.keys()),
                     value=i18n("中文"),
                 )
                 how_to_cut = gr.Dropdown(
                     label=i18n("怎么切"),
                     choices=[
-                        i18n("不切")
-                        i18n("凑四句一切")
-                        i18n("凑50字一切")
-                        i18n("按中文句号。切")
-                        i18n("按英文句号.切")
-                        i18n("按标点符号切")
+                        i18n("不切"),
+                        i18n("凑四句一切"),
+                        i18n("凑50字一切"),
+                        i18n("按中文句号。切"),
+                        i18n("按英文句号.切"),
+                        i18n("按标点符号切"),
                     ],
                     value=i18n("凑四句一切"),
                     interactive=True,
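Note: the `how_to_cut` choices are translated labels, and `get_tts_wav` compares the selected value against the same `i18n()` calls (the `text = cut5(text)` context line in the `@@ -532` hunk is the tail of that if/elif chain), so the dropdown and the comparisons must share one i18n instance. A minimal sketch of that label-comparison pattern, with `i18n` and the cut functions stubbed as hypothetical placeholders:

    def i18n(key: str) -> str:
        return key  # identity stub; the webui uses I18nAuto here

    def cut1(text: str) -> str:
        return text  # hypothetical stub for "凑四句一切"

    def cut5(text: str) -> str:
        return text  # hypothetical stub for "按标点符号切"

    def apply_cut(how_to_cut: str, text: str) -> str:
        # The dropdown value is the translated label, so compare against i18n() output.
        if how_to_cut == i18n("凑四句一切"):
            return cut1(text)
        if how_to_cut == i18n("按标点符号切"):
            return cut5(text)
        return text  # i18n("不切"): leave the text unchanged

    print(apply_cut(i18n("按标点符号切"), "你好。世界"))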
@@ -907,9 +892,9 @@ if __name__ == "__main__":
     gen = get_tts_wav(
         ref_wav_path=file_name,
         prompt_text="",
-        prompt_language=i18n("中文")
+        prompt_language=i18n("中文"),
         text="犯大吴疆土者,盛必击而破之,犯大吴疆土者,盛必击而破之,犯大吴疆土者,盛必击而破之,犯大吴疆土者,盛必击而破之.你好世界 Love you 世界へ 안녕하세요",
-        text_language=i18n("多语种混合")
+        text_language=i18n("多语种混合"),
         inp_refs=[],
     )
     next(gen)
@@ -918,5 +903,5 @@ if __name__ == "__main__":
         server_name="0.0.0.0",
         inbrowser=True,
         show_api=False,
-        allowed_paths=["/"]
+        allowed_paths=["/"]#,i18n=i18n
     )