Update app.py
Browse files
app.py
CHANGED
|
@@ -114,13 +114,25 @@ class App:
|
|
| 114 |
cb_diarize = gr.Checkbox(value=diarization_params["is_diarize"],label="Use diarization",interactive=True)
|
| 115 |
tb_hf_token = gr.Text(label="Token", value=diarization_params["hf_token"],info="An access token is required to use diarization & can be created [here](https://hf.co/settings/tokens). If not done yet for your account, you need to accept the terms & conditions of [diarization](https://huggingface.co/pyannote/speaker-diarization-3.1) & [segmentation](https://huggingface.co/pyannote/segmentation-3.0)")
|
| 116 |
|
| 117 |
-
with gr.Accordion("
|
| 118 |
-
cb_bgm_separation = gr.Checkbox(label="Enable Background Music Remover Filter", value=uvr_params["is_separate_bgm"],
|
| 119 |
-
interactive=True,
|
| 120 |
-
info="Enable to remove background music before transcribing")
|
| 121 |
cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=vad_params["vad_filter"],
|
| 122 |
-
|
| 123 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 124 |
|
| 125 |
with gr.Accordion("Advanced options", open=False, visible=False):
|
| 126 |
with gr.Accordion("Advanced diarization options", open=False, visible=True):
|
|
@@ -200,9 +212,9 @@ class App:
|
|
| 200 |
nb_batch_size = gr.Number(label="Batch Size", value=whisper_params["batch_size"], precision=0)
|
| 201 |
|
| 202 |
with gr.Accordion("Background Music Remover Filter", open=False):
|
| 203 |
-
|
| 204 |
-
|
| 205 |
-
|
| 206 |
dd_uvr_device = gr.Dropdown(label="Device", value=self.whisper_inf.music_separator.device,
|
| 207 |
choices=self.whisper_inf.music_separator.available_devices)
|
| 208 |
dd_uvr_model_size = gr.Dropdown(label="Model", value=uvr_params["model_size"],
|
|
@@ -212,25 +224,25 @@ class App:
|
|
| 212 |
cb_uvr_enable_offload = gr.Checkbox(label="Offload sub model after removing background music",
|
| 213 |
value=uvr_params["enable_offload"])
|
| 214 |
|
| 215 |
-
with gr.Accordion("Voice Detection Filter", open=False):
|
| 216 |
# cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=vad_params["vad_filter"],
|
| 217 |
# interactive=True,
|
| 218 |
# info="Enable this to transcribe only detected voice parts by submodel.")
|
| 219 |
-
sd_threshold = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, label="Speech Threshold",
|
| 220 |
-
value=vad_params["threshold"],
|
| 221 |
-
info="Lower it to be more sensitive to small sounds.")
|
| 222 |
-
nb_min_speech_duration_ms = gr.Number(label="Minimum Speech Duration (ms)", precision=0,
|
| 223 |
-
value=vad_params["min_speech_duration_ms"],
|
| 224 |
-
info="Final speech chunks shorter than this time are thrown out")
|
| 225 |
-
nb_max_speech_duration_s = gr.Number(label="Maximum Speech Duration (s)",
|
| 226 |
-
value=vad_params["max_speech_duration_s"],
|
| 227 |
-
info="Maximum duration of speech chunks in \"seconds\".")
|
| 228 |
-
nb_min_silence_duration_ms = gr.Number(label="Minimum Silence Duration (ms)", precision=0,
|
| 229 |
-
value=vad_params["min_silence_duration_ms"],
|
| 230 |
-
info="In the end of each speech chunk wait for this time"
|
| 231 |
-
" before separating it")
|
| 232 |
-
nb_speech_pad_ms = gr.Number(label="Speech Padding (ms)", precision=0, value=vad_params["speech_pad_ms"],
|
| 233 |
-
info="Final speech chunks are padded by this time each side")
|
| 234 |
|
| 235 |
#dd_model.change(fn=self.on_change_models, inputs=[dd_model], outputs=[cb_translate])
|
| 236 |
|
|
|
|
| 114 |
cb_diarize = gr.Checkbox(value=diarization_params["is_diarize"],label="Use diarization",interactive=True)
|
| 115 |
tb_hf_token = gr.Text(label="Token", value=diarization_params["hf_token"],info="An access token is required to use diarization & can be created [here](https://hf.co/settings/tokens). If not done yet for your account, you need to accept the terms & conditions of [diarization](https://huggingface.co/pyannote/speaker-diarization-3.1) & [segmentation](https://huggingface.co/pyannote/segmentation-3.0)")
|
| 116 |
|
| 117 |
+
with gr.Accordion("Voice Detection Filter", open=False, visible=True):
|
|
|
|
|
|
|
|
|
|
| 118 |
cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=vad_params["vad_filter"],
|
| 119 |
+
interactive=True,
|
| 120 |
+
info="Enable to transcribe only detected voice parts")
|
| 121 |
+
sd_threshold = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, label="Speech Threshold",
|
| 122 |
+
value=vad_params["threshold"],
|
| 123 |
+
info="Lower it to be more sensitive to small sounds")
|
| 124 |
+
nb_min_speech_duration_ms = gr.Number(label="Minimum Speech Duration (ms)", precision=0,
|
| 125 |
+
value=vad_params["min_speech_duration_ms"],
|
| 126 |
+
info="Final speech chunks shorter than this time are thrown out")
|
| 127 |
+
nb_max_speech_duration_s = gr.Number(label="Maximum Speech Duration (s)",
|
| 128 |
+
value=vad_params["max_speech_duration_s"],
|
| 129 |
+
info="Maximum duration of speech chunks in \"seconds\"")
|
| 130 |
+
nb_min_silence_duration_ms = gr.Number(label="Minimum Silence Duration (ms)", precision=0,
|
| 131 |
+
value=vad_params["min_silence_duration_ms"],
|
| 132 |
+
info="In the end of each speech chunk wait for this time"
|
| 133 |
+
" before separating it")
|
| 134 |
+
nb_speech_pad_ms = gr.Number(label="Speech Padding (ms)", precision=0, value=vad_params["speech_pad_ms"],
|
| 135 |
+
info="Final speech chunks are padded by this time each side")
|
| 136 |
|
| 137 |
with gr.Accordion("Advanced options", open=False, visible=False):
|
| 138 |
with gr.Accordion("Advanced diarization options", open=False, visible=True):
|
|
|
|
| 212 |
nb_batch_size = gr.Number(label="Batch Size", value=whisper_params["batch_size"], precision=0)
|
| 213 |
|
| 214 |
with gr.Accordion("Background Music Remover Filter", open=False):
|
| 215 |
+
cb_bgm_separation = gr.Checkbox(label="Enable Background Music Remover Filter", value=uvr_params["is_separate_bgm"],
|
| 216 |
+
interactive=True,
|
| 217 |
+
info="Enabling this will remove background music by submodel before transcribing.")
|
| 218 |
dd_uvr_device = gr.Dropdown(label="Device", value=self.whisper_inf.music_separator.device,
|
| 219 |
choices=self.whisper_inf.music_separator.available_devices)
|
| 220 |
dd_uvr_model_size = gr.Dropdown(label="Model", value=uvr_params["model_size"],
|
|
|
|
| 224 |
cb_uvr_enable_offload = gr.Checkbox(label="Offload sub model after removing background music",
|
| 225 |
value=uvr_params["enable_offload"])
|
| 226 |
|
| 227 |
+
# with gr.Accordion("Voice Detection Filter", open=False):
|
| 228 |
# cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=vad_params["vad_filter"],
|
| 229 |
# interactive=True,
|
| 230 |
# info="Enable this to transcribe only detected voice parts by submodel.")
|
| 231 |
+
# sd_threshold = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, label="Speech Threshold",
|
| 232 |
+
# value=vad_params["threshold"],
|
| 233 |
+
# info="Lower it to be more sensitive to small sounds.")
|
| 234 |
+
# nb_min_speech_duration_ms = gr.Number(label="Minimum Speech Duration (ms)", precision=0,
|
| 235 |
+
# value=vad_params["min_speech_duration_ms"],
|
| 236 |
+
# info="Final speech chunks shorter than this time are thrown out")
|
| 237 |
+
# nb_max_speech_duration_s = gr.Number(label="Maximum Speech Duration (s)",
|
| 238 |
+
# value=vad_params["max_speech_duration_s"],
|
| 239 |
+
# info="Maximum duration of speech chunks in \"seconds\".")
|
| 240 |
+
# nb_min_silence_duration_ms = gr.Number(label="Minimum Silence Duration (ms)", precision=0,
|
| 241 |
+
# value=vad_params["min_silence_duration_ms"],
|
| 242 |
+
# info="In the end of each speech chunk wait for this time"
|
| 243 |
+
# " before separating it")
|
| 244 |
+
# nb_speech_pad_ms = gr.Number(label="Speech Padding (ms)", precision=0, value=vad_params["speech_pad_ms"],
|
| 245 |
+
# info="Final speech chunks are padded by this time each side")
|
| 246 |
|
| 247 |
#dd_model.change(fn=self.on_change_models, inputs=[dd_model], outputs=[cb_translate])
|
| 248 |
|