NeoPy committed on
Commit
07323c8
·
verified ·
1 Parent(s): 661e8f5

Create tabs/inference/inference.py

Browse files
Files changed (1) hide show
  1. main/app/tabs/inference/inference.py +597 -0
main/app/tabs/inference/inference.py ADDED
@@ -0,0 +1,597 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from main.tools import huggingface
3
+ from main.configs.config import Config
4
+ from main.app.based.utils import *
5
+
6
+ def inference_tabs():
7
+ # Audio Conversion Tab
8
+ with gr.TabItem(translations["convert_audio"], visible=configs.get("convert_tab", True)):
9
+ gr.Markdown(f"## {translations['convert_audio']}")
10
+ with gr.Row():
11
+ gr.Markdown(translations["convert_info"])
12
+
13
+ with gr.Row():
14
+ with gr.Column():
15
+ with gr.Accordion(translations["model_accordion"], open=True):
16
+ with gr.Row():
17
+ model_pth = gr.Dropdown(label=translations["model_name"], choices=model_name, value=model_name[0] if len(model_name) >= 1 else "", interactive=True, allow_custom_value=True)
18
+ model_index = gr.Dropdown(label=translations["index_path"], choices=index_path, value=index_path[0] if len(index_path) >= 1 else "", interactive=True, allow_custom_value=True)
19
+ refesh = gr.Button(translations["refesh"])
20
+
21
+ with gr.Row():
22
+ with gr.Column():
23
+ audio_select = gr.Dropdown(label=translations["select_separate"], choices=[], value="", interactive=True, allow_custom_value=True, visible=False)
24
+ convert_button_2 = gr.Button(translations["convert_audio"], visible=False)
25
+ with gr.Row():
26
+ with gr.Column():
27
+ input0 = gr.File(label=translations["drop_audio"], file_types=[".wav", ".mp3", ".flac", ".ogg", ".opus", ".m4a", ".mp4", ".aac", ".alac", ".wma", ".aiff", ".webm", ".ac3"])
28
+ play_audio = gr.Audio(show_download_button=True, interactive=False, label=translations["input_audio"])
29
+ with gr.Row():
30
+ with gr.Column():
31
+ with gr.Row():
32
+ index_strength = gr.Slider(label=translations["index_strength"], info=translations["index_strength_info"], minimum=0, maximum=1, value=0.5, step=0.01, interactive=True, visible=model_index.value != "")
33
+ with gr.Row():
34
+ with gr.Column():
35
+ with gr.Accordion(translations["input_output"], open=False):
36
+ with gr.Column():
37
+ export_format = gr.Radio(label=translations["export_format"], info=translations["export_info"], choices=["wav", "mp3", "flac", "ogg", "opus", "m4a", "mp4", "aac", "alac", "wma", "aiff", "webm", "ac3"], value="wav", interactive=True)
38
+ input_audio0 = gr.Dropdown(label=translations["audio_path"], value="", choices=paths_for_files, info=translations["provide_audio"], allow_custom_value=True, interactive=True)
39
+ output_audio = gr.Textbox(label=translations["output_path"], value="audios/output.wav", placeholder="audios/output.wav", info=translations["output_path_info"], interactive=True)
40
+ with gr.Column():
41
+ refesh0 = gr.Button(translations["refesh"])
42
+ with gr.Accordion(translations["setting"], open=False):
43
+ with gr.Row():
44
+ cleaner0 = gr.Checkbox(label=translations["clear_audio"], value=False, interactive=True)
45
+ autotune = gr.Checkbox(label=translations["autotune"], value=False, interactive=True)
46
+ use_audio = gr.Checkbox(label=translations["use_audio"], value=False, interactive=True)
47
+ checkpointing = gr.Checkbox(label=translations["memory_efficient_training"], value=False, interactive=True)
48
+ with gr.Row():
49
+ use_original = gr.Checkbox(label=translations["convert_original"], value=False, interactive=True, visible=use_audio.value)
50
+ convert_backing = gr.Checkbox(label=translations["convert_backing"], value=False, interactive=True, visible=use_audio.value)
51
+ not_merge_backing = gr.Checkbox(label=translations["not_merge_backing"], value=False, interactive=True, visible=use_audio.value)
52
+ merge_instrument = gr.Checkbox(label=translations["merge_instruments"], value=False, interactive=True, visible=use_audio.value)
53
+ with gr.Row():
54
+ pitch = gr.Slider(minimum=-20, maximum=20, step=1, info=translations["pitch_info"], label=translations["pitch"], value=0, interactive=True)
55
+ clean_strength0 = gr.Slider(label=translations["clean_strength"], info=translations["clean_strength_info"], minimum=0, maximum=1, value=0.5, step=0.1, interactive=True, visible=cleaner0.value)
56
+
57
+ with gr.Accordion(translations["f0_method"], open=False):
58
+ with gr.Group():
59
+ with gr.Row():
60
+ onnx_f0_mode = gr.Checkbox(label=translations["f0_onnx_mode"], info=translations["f0_onnx_mode_info"], value=False, interactive=True)
61
+ unlock_full_method = gr.Checkbox(label=translations["f0_unlock"], info=translations["f0_unlock_info"], value=False, interactive=True)
62
+ method = gr.Radio(label=translations["f0_method"], info=translations["f0_method_info"], choices=method_f0+["hybrid"], value="rmvpe", interactive=True)
63
+ hybrid_method = gr.Dropdown(label=translations["f0_method_hybrid"], info=translations["f0_method_hybrid_info"], choices=["hybrid[pm+dio]", "hybrid[pm+crepe-tiny]", "hybrid[pm+crepe]", "hybrid[pm+fcpe]", "hybrid[pm+rmvpe]", "hybrid[pm+harvest]", "hybrid[pm+yin]", "hybrid[dio+crepe-tiny]", "hybrid[dio+crepe]", "hybrid[dio+fcpe]", "hybrid[dio+rmvpe]", "hybrid[dio+harvest]", "hybrid[dio+yin]", "hybrid[crepe-tiny+crepe]", "hybrid[crepe-tiny+fcpe]", "hybrid[crepe-tiny+rmvpe]", "hybrid[crepe-tiny+harvest]", "hybrid[crepe+fcpe]", "hybrid[crepe+rmvpe]", "hybrid[crepe+harvest]", "hybrid[crepe+yin]", "hybrid[fcpe+rmvpe]", "hybrid[fcpe+harvest]", "hybrid[fcpe+yin]", "hybrid[rmvpe+harvest]", "hybrid[rmvpe+yin]", "hybrid[harvest+yin]"], value="hybrid[pm+dio]", interactive=True, allow_custom_value=True, visible=method.value == "hybrid")
64
+ hop_length = gr.Slider(label="Hop length", info=translations["hop_length_info"], minimum=1, maximum=512, value=128, step=1, interactive=True, visible=False)
65
+ with gr.Accordion(translations["f0_file"], open=False):
66
+ upload_f0_file = gr.File(label=translations["upload_f0"], file_types=[".txt"])
67
+ f0_file_dropdown = gr.Dropdown(label=translations["f0_file_2"], value="", choices=f0_file, allow_custom_value=True, interactive=True)
68
+ refesh_f0_file = gr.Button(translations["refesh"])
69
+ with gr.Accordion(translations["hubert_model"], open=False):
70
+ embed_mode = gr.Radio(label=translations["embed_mode"], info=translations["embed_mode_info"], value="fairseq", choices=embedders_mode, interactive=True, visible=True)
71
+ embedders = gr.Radio(label=translations["hubert_model"], info=translations["hubert_info"], choices=embedders_model, value="hubert_base", interactive=True)
72
+ custom_embedders = gr.Textbox(label=translations["modelname"], info=translations["modelname_info"], value="", placeholder="hubert_base", interactive=True, visible=embedders.value == "custom")
73
+ with gr.Accordion(translations["use_presets"], open=False):
74
+ with gr.Row():
75
+ presets_name = gr.Dropdown(label=translations["file_preset"], choices=presets_file, value=presets_file[0] if len(presets_file) > 0 else '', interactive=True, allow_custom_value=True)
76
+ with gr.Row():
77
+ load_click = gr.Button(translations["load_file"], variant="primary")
78
+ refesh_click = gr.Button(translations["refesh"])
79
+ with gr.Accordion(translations["export_file"], open=False):
80
+ with gr.Row():
81
+ with gr.Column():
82
+ with gr.Group():
83
+ with gr.Row():
84
+ cleaner_chbox = gr.Checkbox(label=translations["save_clean"], value=True, interactive=True)
85
+ autotune_chbox = gr.Checkbox(label=translations["save_autotune"], value=True, interactive=True)
86
+ pitch_chbox = gr.Checkbox(label=translations["save_pitch"], value=True, interactive=True)
87
+ index_strength_chbox = gr.Checkbox(label=translations["save_index_2"], value=True, interactive=True)
88
+ resample_sr_chbox = gr.Checkbox(label=translations["save_resample"], value=True, interactive=True)
89
+ filter_radius_chbox = gr.Checkbox(label=translations["save_filter"], value=True, interactive=True)
90
+ volume_envelope_chbox = gr.Checkbox(label=translations["save_envelope"], value=True, interactive=True)
91
+ protect_chbox = gr.Checkbox(label=translations["save_protect"], value=True, interactive=True)
92
+ split_audio_chbox = gr.Checkbox(label=translations["save_split"], value=True, interactive=True)
93
+ formant_shifting_chbox = gr.Checkbox(label=translations["formantshift"], value=True, interactive=True)
94
+ with gr.Row():
95
+ with gr.Column():
96
+ name_to_save_file = gr.Textbox(label=translations["filename_to_save"])
97
+ save_file_button = gr.Button(translations["export_file"])
98
+ with gr.Row():
99
+ upload_presets = gr.File(label=translations["upload_presets"], file_types=[".json"])
100
+ with gr.Column():
101
+ with gr.Row():
102
+ split_audio = gr.Checkbox(label=translations["split_audio"], value=False, interactive=True)
103
+ formant_shifting = gr.Checkbox(label=translations["formantshift"], value=False, interactive=True)
104
+ f0_autotune_strength = gr.Slider(minimum=0, maximum=1, label=translations["autotune_rate"], info=translations["autotune_rate_info"], value=1, step=0.1, interactive=True, visible=autotune.value)
105
+ resample_sr = gr.Slider(minimum=0, maximum=96000, label=translations["resample"], info=translations["resample_info"], value=0, step=1, interactive=True)
106
+ filter_radius = gr.Slider(minimum=0, maximum=7, label=translations["filter_radius"], info=translations["filter_radius_info"], value=3, step=1, interactive=True)
107
+ volume_envelope = gr.Slider(minimum=0, maximum=1, label=translations["volume_envelope"], info=translations["volume_envelope_info"], value=1, step=0.1, interactive=True)
108
+ protect = gr.Slider(minimum=0, maximum=1, label=translations["protect"], info=translations["protect_info"], value=0.5, step=0.01, interactive=True)
109
+ with gr.Row():
110
+ formant_qfrency = gr.Slider(value=1.0, label=translations["formant_qfrency"], info=translations["formant_qfrency"], minimum=0.0, maximum=16.0, step=0.1, interactive=True, visible=False)
111
+ formant_timbre = gr.Slider(value=1.0, label=translations["formant_timbre"], info=translations["formant_timbre"], minimum=0.0, maximum=16.0, step=0.1, interactive=True, visible=False)
112
+ with gr.Row():
113
+ convert_button = gr.Button(translations["convert_audio"], variant="primary")
114
+ gr.Markdown(translations["output_convert"])
115
+ with gr.Row():
116
+ main_convert = gr.Audio(show_download_button=True, interactive=False, label=translations["main_convert"])
117
+ backing_convert = gr.Audio(show_download_button=True, interactive=False, label=translations["convert_backing"], visible=convert_backing.value)
118
+ main_backing = gr.Audio(show_download_button=True, interactive=False, label=translations["main_or_backing"], visible=convert_backing.value)
119
+ with gr.Row():
120
+ original_convert = gr.Audio(show_download_button=True, interactive=False, label=translations["convert_original"], visible=use_original.value)
121
+ vocal_instrument = gr.Audio(show_download_button=True, interactive=False, label=translations["voice_or_instruments"], visible=merge_instrument.value)
122
+ with gr.Row():
123
+ upload_f0_file.upload(fn=lambda inp: shutil.move(inp.name, os.path.join("assets", "f0")), inputs=[upload_f0_file], outputs=[f0_file_dropdown])
124
+ refesh_f0_file.click(fn=change_f0_choices, inputs=[], outputs=[f0_file_dropdown])
125
+ unlock_full_method.change(fn=unlock_f0, inputs=[unlock_full_method], outputs=[method])
126
+ with gr.Row():
127
+ load_click.click(
128
+ fn=load_presets,
129
+ inputs=[
130
+ presets_name,
131
+ cleaner0,
132
+ autotune,
133
+ pitch,
134
+ clean_strength0,
135
+ index_strength,
136
+ resample_sr,
137
+ filter_radius,
138
+ volume_envelope,
139
+ protect,
140
+ split_audio,
141
+ f0_autotune_strength,
142
+ formant_qfrency,
143
+ formant_timbre
144
+ ],
145
+ outputs=[
146
+ cleaner0,
147
+ autotune,
148
+ pitch,
149
+ clean_strength0,
150
+ index_strength,
151
+ resample_sr,
152
+ filter_radius,
153
+ volume_envelope,
154
+ protect,
155
+ split_audio,
156
+ f0_autotune_strength,
157
+ formant_shifting,
158
+ formant_qfrency,
159
+ formant_timbre
160
+ ]
161
+ )
162
+ refesh_click.click(fn=change_preset_choices, inputs=[], outputs=[presets_name])
163
+ save_file_button.click(
164
+ fn=save_presets,
165
+ inputs=[
166
+ name_to_save_file,
167
+ cleaner0,
168
+ autotune,
169
+ pitch,
170
+ clean_strength0,
171
+ index_strength,
172
+ resample_sr,
173
+ filter_radius,
174
+ volume_envelope,
175
+ protect,
176
+ split_audio,
177
+ f0_autotune_strength,
178
+ cleaner_chbox,
179
+ autotune_chbox,
180
+ pitch_chbox,
181
+ index_strength_chbox,
182
+ resample_sr_chbox,
183
+ filter_radius_chbox,
184
+ volume_envelope_chbox,
185
+ protect_chbox,
186
+ split_audio_chbox,
187
+ formant_shifting_chbox,
188
+ formant_shifting,
189
+ formant_qfrency,
190
+ formant_timbre
191
+ ],
192
+ outputs=[presets_name]
193
+ )
194
+ with gr.Row():
195
+ upload_presets.upload(fn=lambda audio_in: shutil.move(audio_in.name, os.path.join("assets", "presets")), inputs=[upload_presets], outputs=[presets_name])
196
+ autotune.change(fn=visible, inputs=[autotune], outputs=[f0_autotune_strength])
197
+ use_audio.change(fn=lambda a: [visible(a), visible(a), visible(a), visible(a), visible(a), valueFalse_interactive(a), valueFalse_interactive(a), valueFalse_interactive(a), valueFalse_interactive(a), visible(not a), visible(not a), visible(not a), visible(not a)], inputs=[use_audio], outputs=[main_backing, use_original, convert_backing, not_merge_backing, merge_instrument, use_original, convert_backing, not_merge_backing, merge_instrument, input_audio0, output_audio, input0, play_audio])
198
+ with gr.Row():
199
+ convert_backing.change(fn=lambda a,b: [change_backing_choices(a, b), visible(a)], inputs=[convert_backing, not_merge_backing], outputs=[use_original, backing_convert])
200
+ use_original.change(fn=lambda audio, original: [visible(original), visible(not original), visible(audio and not original), valueFalse_interactive(not original), valueFalse_interactive(not original)], inputs=[use_audio, use_original], outputs=[original_convert, main_convert, main_backing, convert_backing, not_merge_backing])
201
+ cleaner0.change(fn=visible, inputs=[cleaner0], outputs=[clean_strength0])
202
+ with gr.Row():
203
+ merge_instrument.change(fn=visible, inputs=[merge_instrument], outputs=[vocal_instrument])
204
+ not_merge_backing.change(fn=lambda audio, merge, cvb: [visible(audio and not merge), change_backing_choices(cvb, merge)], inputs=[use_audio, not_merge_backing, convert_backing], outputs=[main_backing, use_original])
205
+ method.change(fn=lambda method, hybrid: [visible(method == "hybrid"), hoplength_show(method, hybrid)], inputs=[method, hybrid_method], outputs=[hybrid_method, hop_length])
206
+ with gr.Row():
207
+ hybrid_method.change(fn=hoplength_show, inputs=[method, hybrid_method], outputs=[hop_length])
208
+ refesh.click(fn=change_models_choices, inputs=[], outputs=[model_pth, model_index])
209
+ model_pth.change(fn=get_index, inputs=[model_pth], outputs=[model_index])
210
+ with gr.Row():
211
+ input0.upload(fn=lambda audio_in: shutil.move(audio_in.name, os.path.join("audios")), inputs=[input0], outputs=[input_audio0])
212
+ input_audio0.change(fn=lambda audio: audio if os.path.isfile(audio) else None, inputs=[input_audio0], outputs=[play_audio])
213
+ formant_shifting.change(fn=lambda a: [visible(a)]*2, inputs=[formant_shifting], outputs=[formant_qfrency, formant_timbre])
214
+ with gr.Row():
215
+ embedders.change(fn=lambda embedders: visible(embedders == "custom"), inputs=[embedders], outputs=[custom_embedders])
216
+ refesh0.click(fn=change_audios_choices, inputs=[input_audio0], outputs=[input_audio0])
217
+ model_index.change(fn=index_strength_show, inputs=[model_index], outputs=[index_strength])
218
+ with gr.Row():
219
+ audio_select.change(fn=lambda: visible(True), inputs=[], outputs=[convert_button_2])
220
+ convert_button.click(fn=lambda: visible(False), inputs=[], outputs=[convert_button])
221
+ convert_button_2.click(fn=lambda: [visible(False), visible(False)], inputs=[], outputs=[audio_select, convert_button_2])
222
+ with gr.Row():
223
+ convert_button.click(
224
+ fn=convert_selection,
225
+ inputs=[
226
+ cleaner0,
227
+ autotune,
228
+ use_audio,
229
+ use_original,
230
+ convert_backing,
231
+ not_merge_backing,
232
+ merge_instrument,
233
+ pitch,
234
+ clean_strength0,
235
+ model_pth,
236
+ model_index,
237
+ index_strength,
238
+ input_audio0,
239
+ output_audio,
240
+ export_format,
241
+ method,
242
+ hybrid_method,
243
+ hop_length,
244
+ embedders,
245
+ custom_embedders,
246
+ resample_sr,
247
+ filter_radius,
248
+ volume_envelope,
249
+ protect,
250
+ split_audio,
251
+ f0_autotune_strength,
252
+ checkpointing,
253
+ onnx_f0_mode,
254
+ formant_shifting,
255
+ formant_qfrency,
256
+ formant_timbre,
257
+ f0_file_dropdown,
258
+ embed_mode
259
+ ],
260
+ outputs=[audio_select, main_convert, backing_convert, main_backing, original_convert, vocal_instrument, convert_button],
261
+ api_name="convert_selection"
262
+ )
263
+ embed_mode.change(fn=visible_embedders, inputs=[embed_mode], outputs=[embedders])
264
+ convert_button_2.click(
265
+ fn=convert_audio,
266
+ inputs=[
267
+ cleaner0,
268
+ autotune,
269
+ use_audio,
270
+ use_original,
271
+ convert_backing,
272
+ not_merge_backing,
273
+ merge_instrument,
274
+ pitch,
275
+ clean_strength0,
276
+ model_pth,
277
+ model_index,
278
+ index_strength,
279
+ input_audio0,
280
+ output_audio,
281
+ export_format,
282
+ method,
283
+ hybrid_method,
284
+ hop_length,
285
+ embedders,
286
+ custom_embedders,
287
+ resample_sr,
288
+ filter_radius,
289
+ volume_envelope,
290
+ protect,
291
+ split_audio,
292
+ f0_autotune_strength,
293
+ audio_select,
294
+ checkpointing,
295
+ onnx_f0_mode,
296
+ formant_shifting,
297
+ formant_qfrency,
298
+ formant_timbre,
299
+ f0_file_dropdown,
300
+ embed_mode
301
+ ],
302
+ outputs=[main_convert, backing_convert, main_backing, original_convert, vocal_instrument, convert_button],
303
+ api_name="convert_audio"
304
+ )
305
+
306
+ # Text-to-Speech Conversion Tab
307
+ with gr.TabItem(translations["convert_text"], visible=configs.get("tts_tab", True)):
308
+ gr.Markdown(translations["convert_text_markdown"])
309
+ with gr.Row():
310
+ gr.Markdown(translations["convert_text_markdown_2"])
311
+ with gr.Accordion(translations["model_accordion"], open=True):
312
+ with gr.Row():
313
+ model_pth0 = gr.Dropdown(label=translations["model_name"], choices=model_name, value=model_name[0] if len(model_name) >= 1 else "", interactive=True, allow_custom_value=True)
314
+ model_index0 = gr.Dropdown(label=translations["index_path"], choices=index_path, value=index_path[0] if len(index_path) >= 1 else "", interactive=True, allow_custom_value=True)
315
+
316
+ with gr.Row():
317
+ with gr.Column():
318
+ with gr.Group():
319
+ with gr.Row():
320
+ use_txt = gr.Checkbox(label=translations["input_txt"], value=False, interactive=True)
321
+ google_tts_check_box = gr.Checkbox(label=translations["googletts"], value=False, interactive=True)
322
+ prompt = gr.Textbox(label=translations["text_to_speech"], value="", placeholder="Hello Words", lines=3)
323
+ with gr.Column():
324
+ speed = gr.Slider(label=translations["voice_speed"], info=translations["voice_speed_info"], minimum=-100, maximum=100, value=0, step=1)
325
+ pitch0 = gr.Slider(minimum=-20, maximum=20, step=1, info=translations["pitch_info"], label=translations["pitch"], value=0, interactive=True)
326
+ with gr.Row():
327
+ tts_button = gr.Button(translations["tts_1"], variant="primary", scale=2)
328
+ convert_button0 = gr.Button(translations["tts_2"], variant="secondary", scale=2)
329
+ with gr.Row():
330
+ with gr.Column():
331
+ txt_input = gr.File(label=translations["drop_text"], file_types=[".txt", ".srt"], visible=use_txt.value)
332
+ tts_voice = gr.Dropdown(label=translations["voice"], choices=edgetts, interactive=True, value="vi-VN-NamMinhNeural")
333
+ tts_pitch = gr.Slider(minimum=-20, maximum=20, step=1, info=translations["pitch_info_2"], label=translations["pitch"], value=0, interactive=True)
334
+ with gr.Column():
335
+ refesh1 = gr.Button(translations["refesh"])
336
+ with gr.Row():
337
+ index_strength0 = gr.Slider(label=translations["index_strength"], info=translations["index_strength_info"], minimum=0, maximum=1, value=0.5, step=0.01, interactive=True, visible=model_index0.value != "")
338
+ with gr.Accordion(translations["output_path"], open=False):
339
+ export_format0 = gr.Radio(label=translations["export_format"], info=translations["export_info"], choices=["wav", "mp3", "flac", "ogg", "opus", "m4a", "mp4", "aac", "alac", "wma", "aiff", "webm", "ac3"], value="wav", interactive=True)
340
+ output_audio0 = gr.Textbox(label=translations["output_tts"], value="audios/tts.wav", placeholder="audios/tts.wav", info=translations["tts_output"], interactive=True)
341
+ output_audio1 = gr.Textbox(label=translations["output_tts_convert"], value="audios/tts-convert.wav", placeholder="audios/tts-convert.wav", info=translations["tts_output"], interactive=True)
342
+ with gr.Accordion(translations["setting"], open=False):
343
+ with gr.Accordion(translations["f0_method"], open=False):
344
+ with gr.Group():
345
+ with gr.Row():
346
+ onnx_f0_mode1 = gr.Checkbox(label=translations["f0_onnx_mode"], info=translations["f0_onnx_mode_info"], value=False, interactive=True)
347
+ unlock_full_method3 = gr.Checkbox(label=translations["f0_unlock"], info=translations["f0_unlock_info"], value=False, interactive=True)
348
+ method0 = gr.Radio(label=translations["f0_method"], info=translations["f0_method_info"], choices=method_f0+["hybrid"], value="rmvpe", interactive=True)
349
+ hybrid_method0 = gr.Dropdown(label=translations["f0_method_hybrid"], info=translations["f0_method_hybrid_info"], choices=["hybrid[pm+dio]", "hybrid[pm+crepe-tiny]", "hybrid[pm+crepe]", "hybrid[pm+fcpe]", "hybrid[pm+rmvpe]", "hybrid[pm+harvest]", "hybrid[pm+yin]", "hybrid[dio+crepe-tiny]", "hybrid[dio+crepe]", "hybrid[dio+fcpe]", "hybrid[dio+rmvpe]", "hybrid[dio+harvest]", "hybrid[dio+yin]", "hybrid[crepe-tiny+crepe]", "hybrid[crepe-tiny+fcpe]", "hybrid[crepe-tiny+rmvpe]", "hybrid[crepe-tiny+harvest]", "hybrid[crepe+fcpe]", "hybrid[crepe+rmvpe]", "hybrid[crepe+harvest]", "hybrid[crepe+yin]", "hybrid[fcpe+rmvpe]", "hybrid[fcpe+harvest]", "hybrid[fcpe+yin]", "hybrid[rmvpe+harvest]", "hybrid[rmvpe+yin]", "hybrid[harvest+yin]"], value="hybrid[pm+dio]", interactive=True, allow_custom_value=True, visible=method0.value == "hybrid")
350
+ hop_length0 = gr.Slider(label="Hop length", info=translations["hop_length_info"], minimum=1, maximum=512, value=128, step=1, interactive=True, visible=False)
351
+ with gr.Accordion(translations["f0_file"], open=False):
352
+ upload_f0_file0 = gr.File(label=translations["upload_f0"], file_types=[".txt"])
353
+ f0_file_dropdown0 = gr.Dropdown(label=translations["f0_file_2"], value="", choices=f0_file, allow_custom_value=True, interactive=True)
354
+ refesh_f0_file0 = gr.Button(translations["refesh"])
355
+ with gr.Accordion(translations["hubert_model"], open=False):
356
+ embed_mode1 = gr.Radio(label=translations["embed_mode"], info=translations["embed_mode_info"], value="fairseq", choices=embedders_mode, interactive=True, visible=True)
357
+ embedders0 = gr.Radio(label=translations["hubert_model"], info=translations["hubert_info"], choices=embedders_model, value="hubert_base", interactive=True)
358
+ custom_embedders0 = gr.Textbox(label=translations["modelname"], info=translations["modelname_info"], value="", placeholder="hubert_base", interactive=True, visible=embedders0.value == "custom")
359
+ with gr.Group():
360
+ with gr.Row():
361
+ formant_shifting1 = gr.Checkbox(label=translations["formantshift"], value=False, interactive=True)
362
+ split_audio0 = gr.Checkbox(label=translations["split_audio"], value=False, interactive=True)
363
+ cleaner1 = gr.Checkbox(label=translations["clear_audio"], value=False, interactive=True)
364
+ autotune3 = gr.Checkbox(label=translations["autotune"], value=False, interactive=True)
365
+ checkpointing0 = gr.Checkbox(label=translations["memory_efficient_training"], value=False, interactive=True)
366
+ with gr.Column():
367
+ f0_autotune_strength0 = gr.Slider(minimum=0, maximum=1, label=translations["autotune_rate"], info=translations["autotune_rate_info"], value=1, step=0.1, interactive=True, visible=autotune3.value)
368
+ clean_strength1 = gr.Slider(label=translations["clean_strength"], info=translations["clean_strength_info"], minimum=0, maximum=1, value=0.5, step=0.1, interactive=True, visible=cleaner1.value)
369
+ resample_sr0 = gr.Slider(minimum=0, maximum=96000, label=translations["resample"], info=translations["resample_info"], value=0, step=1, interactive=True)
370
+ filter_radius0 = gr.Slider(minimum=0, maximum=7, label=translations["filter_radius"], info=translations["filter_radius_info"], value=3, step=1, interactive=True)
371
+ volume_envelope0 = gr.Slider(minimum=0, maximum=1, label=translations["volume_envelope"], info=translations["volume_envelope_info"], value=1, step=0.1, interactive=True)
372
+ protect0 = gr.Slider(minimum=0, maximum=1, label=translations["protect"], info=translations["protect_info"], value=0.5, step=0.01, interactive=True)
373
+ with gr.Row():
374
+ formant_qfrency1 = gr.Slider(value=1.0, label=translations["formant_qfrency"], info=translations["formant_qfrency"], minimum=0.0, maximum=16.0, step=0.1, interactive=True, visible=False)
375
+ formant_timbre1 = gr.Slider(value=1.0, label=translations["formant_timbre"], info=translations["formant_timbre"], minimum=0.0, maximum=16.0, step=0.1, interactive=True, visible=False)
376
+ with gr.Row():
377
+ gr.Markdown(translations["output_tts_markdown"])
378
+ with gr.Row():
379
+ tts_voice_audio = gr.Audio(show_download_button=True, interactive=False, label=translations["output_text_to_speech"])
380
+ tts_voice_convert = gr.Audio(show_download_button=True, interactive=False, label=translations["output_file_tts_convert"])
381
+ with gr.Row():
382
+ unlock_full_method3.change(fn=unlock_f0, inputs=[unlock_full_method3], outputs=[method0])
383
+ upload_f0_file0.upload(fn=lambda inp: shutil.move(inp.name, os.path.join("assets", "f0")), inputs=[upload_f0_file0], outputs=[f0_file_dropdown0])
384
+ refesh_f0_file0.click(fn=change_f0_choices, inputs=[], outputs=[f0_file_dropdown0])
385
+ with gr.Row():
386
+ embed_mode1.change(fn=visible_embedders, inputs=[embed_mode1], outputs=[embedders0])
387
+ autotune3.change(fn=visible, inputs=[autotune3], outputs=[f0_autotune_strength0])
388
+ model_pth0.change(fn=get_index, inputs=[model_pth0], outputs=[model_index0])
389
+ with gr.Row():
390
+ cleaner1.change(fn=visible, inputs=[cleaner1], outputs=[clean_strength1])
391
+ method0.change(fn=lambda method, hybrid: [visible(method == "hybrid"), hoplength_show(method, hybrid)], inputs=[method0, hybrid_method0], outputs=[hybrid_method0, hop_length0])
392
+ hybrid_method0.change(fn=hoplength_show, inputs=[method0, hybrid_method0], outputs=[hop_length0])
393
+ with gr.Row():
394
+ refesh1.click(fn=change_models_choices, inputs=[], outputs=[model_pth0, model_index0])
395
+ embedders0.change(fn=lambda embedders: visible(embedders == "custom"), inputs=[embedders0], outputs=[custom_embedders0])
396
+ formant_shifting1.change(fn=lambda a: [visible(a)]*2, inputs=[formant_shifting1], outputs=[formant_qfrency1, formant_timbre1])
397
+ with gr.Row():
398
+ model_index0.change(fn=index_strength_show, inputs=[model_index0], outputs=[index_strength0])
399
+ txt_input.upload(fn=process_input, inputs=[txt_input], outputs=[prompt])
400
+ use_txt.change(fn=visible, inputs=[use_txt], outputs=[txt_input])
401
+ with gr.Row():
402
+ google_tts_check_box.change(fn=change_tts_voice_choices, inputs=[google_tts_check_box], outputs=[tts_voice])
403
+ tts_button.click(
404
+ fn=TTS,
405
+ inputs=[
406
+ prompt,
407
+ tts_voice,
408
+ speed,
409
+ output_audio0,
410
+ tts_pitch,
411
+ google_tts_check_box,
412
+ txt_input
413
+ ],
414
+ outputs=[tts_voice_audio],
415
+ api_name="text-to-speech"
416
+ )
417
+ convert_button0.click(
418
+ fn=convert_tts,
419
+ inputs=[
420
+ cleaner1,
421
+ autotune3,
422
+ pitch0,
423
+ clean_strength1,
424
+ model_pth0,
425
+ model_index0,
426
+ index_strength0,
427
+ output_audio0,
428
+ output_audio1,
429
+ export_format0,
430
+ method0,
431
+ hybrid_method0,
432
+ hop_length0,
433
+ embedders0,
434
+ custom_embedders0,
435
+ resample_sr0,
436
+ filter_radius0,
437
+ volume_envelope0,
438
+ protect0,
439
+ split_audio0,
440
+ f0_autotune_strength0,
441
+ checkpointing0,
442
+ onnx_f0_mode1,
443
+ formant_shifting1,
444
+ formant_qfrency1,
445
+ formant_timbre1,
446
+ f0_file_dropdown0,
447
+ embed_mode1
448
+ ],
449
+ outputs=[tts_voice_convert],
450
+ api_name="convert_tts"
451
+ )
452
+
453
+ # Whisper Conversion Tab
454
+ with gr.TabItem(translations["convert_with_whisper"], visible=configs.get("convert_with_whisper", True)):
455
+ gr.Markdown(f"## {translations['convert_with_whisper']}")
456
+ with gr.Row():
457
+ gr.Markdown(translations["convert_with_whisper_info"])
458
+ with gr.Row():
459
+ with gr.Column():
460
+ with gr.Group():
461
+ with gr.Row():
462
+ cleaner2 = gr.Checkbox(label=translations["clear_audio"], value=False, interactive=True)
463
+ autotune2 = gr.Checkbox(label=translations["autotune"], value=False, interactive=True)
464
+ checkpointing2 = gr.Checkbox(label=translations["memory_efficient_training"], value=False, interactive=True)
465
+ formant_shifting2 = gr.Checkbox(label=translations["formantshift"], value=False, interactive=True)
466
+ with gr.Row():
467
+ num_spk = gr.Slider(minimum=2, maximum=8, step=1, info=translations["num_spk_info"], label=translations["num_spk"], value=2, interactive=True)
468
+ with gr.Row():
469
+ with gr.Column():
470
+ convert_button3 = gr.Button(translations["convert_audio"], variant="primary")
471
+ with gr.Row():
472
+ with gr.Column():
473
+ with gr.Accordion(translations["model_accordion"] + " 1", open=True):
474
+ with gr.Row():
475
+ model_pth2 = gr.Dropdown(label=translations["model_name"], choices=model_name, value=model_name[0] if len(model_name) >= 1 else "", interactive=True, allow_custom_value=True)
476
+ model_index2 = gr.Dropdown(label=translations["index_path"], choices=index_path, value=index_path[0] if len(index_path) >= 1 else "", interactive=True, allow_custom_value=True)
477
+ with gr.Row():
478
+ refesh2 = gr.Button(translations["refesh"])
479
+ with gr.Row():
480
+ pitch3 = gr.Slider(minimum=-20, maximum=20, step=1, info=translations["pitch_info"], label=translations["pitch"], value=0, interactive=True)
481
+ index_strength2 = gr.Slider(label=translations["index_strength"], info=translations["index_strength_info"], minimum=0, maximum=1, value=0.5, step=0.01, interactive=True, visible=model_index2.value != "")
482
+ with gr.Accordion(translations["input_output"], open=False):
483
+ with gr.Column():
484
+ export_format2 = gr.Radio(label=translations["export_format"], info=translations["export_info"], choices=["wav", "mp3", "flac", "ogg", "opus", "m4a", "mp4", "aac", "alac", "wma", "aiff", "webm", "ac3"], value="wav", interactive=True)
485
+ input_audio1 = gr.Dropdown(label=translations["audio_path"], value="", choices=paths_for_files, info=translations["provide_audio"], allow_custom_value=True, interactive=True)
486
+ output_audio2 = gr.Textbox(label=translations["output_path"], value="audios/output.wav", placeholder="audios/output.wav", info=translations["output_path_info"], interactive=True)
487
+ with gr.Column():
488
+ refesh4 = gr.Button(translations["refesh"])
489
+ with gr.Row():
490
+ input2 = gr.File(label=translations["drop_audio"], file_types=[".wav", ".mp3", ".flac", ".ogg", ".opus", ".m4a", ".mp4", ".aac", ".alac", ".wma", ".aiff", ".webm", ".ac3"])
491
+ with gr.Column():
492
+ with gr.Accordion(translations["model_accordion"] + " 2", open=True):
493
+ with gr.Row():
494
+ model_pth3 = gr.Dropdown(label=translations["model_name"], choices=model_name, value=model_name[0] if len(model_name) >= 1 else "", interactive=True, allow_custom_value=True)
495
+ model_index3 = gr.Dropdown(label=translations["index_path"], choices=index_path, value=index_path[0] if len(index_path) >= 1 else "", interactive=True, allow_custom_value=True)
496
+ with gr.Row():
497
+ refesh3 = gr.Button(translations["refesh"])
498
+ with gr.Row():
499
+ pitch4 = gr.Slider(minimum=-20, maximum=20, step=1, info=translations["pitch_info"], label=translations["pitch"], value=0, interactive=True)
500
+ index_strength3 = gr.Slider(label=translations["index_strength"], info=translations["index_strength_info"], minimum=0, maximum=1, value=0.5, step=0.01, interactive=True, visible=model_index3.value != "")
501
+ with gr.Accordion(translations["setting"], open=False):
502
+ with gr.Row():
503
+ model_size = gr.Radio(label=translations["model_size"], info=translations["model_size_info"], choices=["tiny", "tiny.en", "base", "base.en", "small", "small.en", "medium", "medium.en", "large-v1", "large-v2", "large-v3", "large-v3-turbo"], value="medium", interactive=True)
504
+ with gr.Accordion(translations["f0_method"], open=False):
505
+ with gr.Group():
506
+ with gr.Row():
507
+ onnx_f0_mode4 = gr.Checkbox(label=translations["f0_onnx_mode"], info=translations["f0_onnx_mode_info"], value=False, interactive=True)
508
+ unlock_full_method2 = gr.Checkbox(label=translations["f0_unlock"], info=translations["f0_unlock_info"], value=False, interactive=True)
509
+ method3 = gr.Radio(label=translations["f0_method"], info=translations["f0_method_info"], choices=method_f0+["hybrid"], value="rmvpe", interactive=True)
510
+ hybrid_method3 = gr.Dropdown(label=translations["f0_method_hybrid"], info=translations["f0_method_hybrid_info"], choices=["hybrid[pm+dio]", "hybrid[pm+crepe-tiny]", "hybrid[pm+crepe]", "hybrid[pm+fcpe]", "hybrid[pm+rmvpe]", "hybrid[pm+harvest]", "hybrid[pm+yin]", "hybrid[dio+crepe-tiny]", "hybrid[dio+crepe]", "hybrid[dio+fcpe]", "hybrid[dio+rmvpe]", "hybrid[dio+harvest]", "hybrid[dio+yin]", "hybrid[crepe-tiny+crepe]", "hybrid[crepe-tiny+fcpe]", "hybrid[crepe-tiny+rmvpe]", "hybrid[crepe-tiny+harvest]", "hybrid[crepe+fcpe]", "hybrid[crepe+rmvpe]", "hybrid[crepe+harvest]", "hybrid[crepe+yin]", "hybrid[fcpe+rmvpe]", "hybrid[fcpe+harvest]", "hybrid[fcpe+yin]", "hybrid[rmvpe+harvest]", "hybrid[rmvpe+yin]", "hybrid[harvest+yin]"], value="hybrid[pm+dio]", interactive=True, allow_custom_value=True, visible=method3.value == "hybrid")
511
+ hop_length3 = gr.Slider(label="Hop length", info=translations["hop_length_info"], minimum=1, maximum=512, value=128, step=1, interactive=True, visible=False)
512
+ with gr.Accordion(translations["hubert_model"], open=False):
513
+ embed_mode3 = gr.Radio(label=translations["embed_mode"], info=translations["embed_mode_info"], value="fairseq", choices=embedders_mode, interactive=True, visible=True)
514
+ embedders3 = gr.Radio(label=translations["hubert_model"], info=translations["hubert_info"], choices=embedders_model, value="hubert_base", interactive=True)
515
+ custom_embedders3 = gr.Textbox(label=translations["modelname"], info=translations["modelname_info"], value="", placeholder="hubert_base", interactive=True, visible=embedders3.value == "custom")
516
+ with gr.Column():
517
+ clean_strength3 = gr.Slider(label=translations["clean_strength"], info=translations["clean_strength_info"], minimum=0, maximum=1, value=0.5, step=0.1, interactive=True, visible=cleaner2.value)
518
+ f0_autotune_strength3 = gr.Slider(minimum=0, maximum=1, label=translations["autotune_rate"], info=translations["autotune_rate_info"], value=1, step=0.1, interactive=True, visible=autotune.value)
519
+ resample_sr3 = gr.Slider(minimum=0, maximum=96000, label=translations["resample"], info=translations["resample_info"], value=0, step=1, interactive=True)
520
+ filter_radius3 = gr.Slider(minimum=0, maximum=7, label=translations["filter_radius"], info=translations["filter_radius_info"], value=3, step=1, interactive=True)
521
+ volume_envelope3 = gr.Slider(minimum=0, maximum=1, label=translations["volume_envelope"], info=translations["volume_envelope_info"], value=1, step=0.1, interactive=True)
522
+ protect3 = gr.Slider(minimum=0, maximum=1, label=translations["protect"], info=translations["protect_info"], value=0.5, step=0.01, interactive=True)
523
+ with gr.Row():
524
+ formant_qfrency3 = gr.Slider(value=1.0, label=translations["formant_qfrency"] + " 1", info=translations["formant_qfrency"], minimum=0.0, maximum=16.0, step=0.1, interactive=True, visible=False)
525
+ formant_timbre3 = gr.Slider(value=1.0, label=translations["formant_timbre"] + " 1", info=translations["formant_timbre"], minimum=0.0, maximum=16.0, step=0.1, interactive=True, visible=False)
526
+ with gr.Row():
527
+ formant_qfrency4 = gr.Slider(value=1.0, label=translations["formant_qfrency"] + " 2", info=translations["formant_qfrency"], minimum=0.0, maximum=16.0, step=0.1, interactive=True, visible=False)
528
+ formant_timbre4 = gr.Slider(value=1.0, label=translations["formant_timbre"] + " 2", info=translations["formant_timbre"], minimum=0.0, maximum=16.0, step=0.1, interactive=True, visible=False)
529
+ with gr.Row():
530
+ gr.Markdown(translations["input_output"])
531
+ with gr.Row():
532
+ play_audio2 = gr.Audio(show_download_button=True, interactive=False, label=translations["input_audio"])
533
+ play_audio3 = gr.Audio(show_download_button=True, interactive=False, label=translations["output_file_tts_convert"])
534
+ with gr.Row():
535
+ autotune2.change(fn=visible, inputs=[autotune2], outputs=[f0_autotune_strength3])
536
+ cleaner2.change(fn=visible, inputs=[cleaner2], outputs=[clean_strength3])
537
+ method3.change(fn=lambda method, hybrid: [visible(method == "hybrid"), hoplength_show(method, hybrid)], inputs=[method3, hybrid_method3], outputs=[hybrid_method3, hop_length3])
538
+ with gr.Row():
539
+ hybrid_method3.change(fn=hoplength_show, inputs=[method3, hybrid_method3], outputs=[hop_length3])
540
+ refesh2.click(fn=change_models_choices, inputs=[], outputs=[model_pth2, model_index2])
541
+ model_pth2.change(fn=get_index, inputs=[model_pth2], outputs=[model_index2])
542
+ with gr.Row():
543
+ refesh3.click(fn=change_models_choices, inputs=[], outputs=[model_pth3, model_index3])
544
+ model_pth3.change(fn=get_index, inputs=[model_pth3], outputs=[model_index3])
545
+ input2.upload(fn=lambda audio_in: shutil.move(audio_in.name, os.path.join("audios")), inputs=[input2], outputs=[input_audio1])
546
+ with gr.Row():
547
+ input_audio1.change(fn=lambda audio: audio if os.path.isfile(audio) else None, inputs=[input_audio1], outputs=[play_audio2])
548
+ formant_shifting2.change(fn=lambda a: [visible(a)]*4, inputs=[formant_shifting2], outputs=[formant_qfrency3, formant_timbre3, formant_qfrency4, formant_timbre4])
549
+ embedders3.change(fn=lambda embedders: visible(embedders == "custom"), inputs=[embedders3], outputs=[custom_embedders3])
550
+ with gr.Row():
551
+ refesh4.click(fn=change_audios_choices, inputs=[input_audio1], outputs=[input_audio1])
552
+ model_index2.change(fn=index_strength_show, inputs=[model_index2], outputs=[index_strength2])
553
+ model_index3.change(fn=index_strength_show, inputs=[model_index3], outputs=[index_strength3])
554
+ with gr.Row():
555
+ unlock_full_method2.change(fn=unlock_f0, inputs=[unlock_full_method2], outputs=[method3])
556
+ embed_mode3.change(fn=visible_embedders, inputs=[embed_mode3], outputs=[embedders3])
557
+ convert_button3.click(
558
+ fn=convert_with_whisper,
559
+ inputs=[
560
+ num_spk,
561
+ model_size,
562
+ cleaner2,
563
+ clean_strength3,
564
+ autotune2,
565
+ f0_autotune_strength3,
566
+ checkpointing2,
567
+ model_pth2,
568
+ model_pth3,
569
+ model_index2,
570
+ model_index3,
571
+ pitch3,
572
+ pitch4,
573
+ index_strength2,
574
+ index_strength3,
575
+ export_format2,
576
+ input_audio1,
577
+ output_audio2,
578
+ onnx_f0_mode4,
579
+ method3,
580
+ hybrid_method3,
581
+ hop_length3,
582
+ embed_mode3,
583
+ embedders3,
584
+ custom_embedders3,
585
+ resample_sr3,
586
+ filter_radius3,
587
+ volume_envelope3,
588
+ protect3,
589
+ formant_shifting2,
590
+ formant_qfrency3,
591
+ formant_timbre3,
592
+ formant_qfrency4,
593
+ formant_timbre4,
594
+ ],
595
+ outputs=[play_audio3],
596
+ api_name="convert_with_whisper"
597
+ )