LAP-DEV committed (verified)
Commit 59414b9 · 1 Parent(s): 71df80d

Update app.py

Files changed (1):
  app.py  +40 -39
app.py CHANGED
@@ -101,7 +101,6 @@ class App:
     with gr.Row():
         dd_model = gr.Dropdown(choices=self.whisper_inf.available_models, value=whisper_params["model_size"],label="Model", info="Larger models increase transcription quality, but reduce performance", interactive=True)
         dd_lang = gr.Dropdown(choices=["Automatic Detection"] + self.whisper_inf.available_langs,value=whisper_params["lang"], label="Language", info="If the language is known upfront, always set it manually", interactive=True)
-        #dd_file_format = gr.Dropdown(choices=["SRT", "WebVTT", "txt"], value="SRT", label="File Format")
         dd_file_format = gr.Dropdown(choices=["TXT","SRT"], value="TXT", label="Output format", info="Output preview format", interactive=True, visible=False)
     with gr.Row():
         dd_translate_model = gr.Dropdown(choices=self.nllb_inf.available_models, value=nllb_params["model_size"],label="Model", info="Model used for translation", interactive=True)
@@ -110,9 +109,9 @@ class App:
     with gr.Row():
         cb_timestamp = gr.Checkbox(value=whisper_params["add_timestamp"], label="Add timestamp to output file",interactive=True)
     with gr.Row():
-        cb_translate = gr.Checkbox(value=whisper_params["is_translate"], label="Translate transcription to English", info="Translate using OpenAI Whisper's built-in module",interactive=True)
+        cb_translate = gr.Checkbox(value=whisper_params["is_translate"], label="Translate to English", info="Translate using OpenAI Whisper's built-in module",interactive=True)
     with gr.Row():
-        cb_translate_output = gr.Checkbox(value=translation_params["translate_output"], label="Translate output to selected language", info="Translate using Facebook's NLLB",interactive=True)
+        cb_translate_output = gr.Checkbox(value=translation_params["translate_output"], label="Translate to selected language", info="Translate using Facebook's NLLB",interactive=True)

     with gr.Accordion("Speaker diarization", open=False, visible=True):
         cb_diarize = gr.Checkbox(value=diarization_params["is_diarize"],label="Use diarization",interactive=True)
@@ -122,42 +121,44 @@ class App:
             value=self.whisper_inf.diarizer.get_device(),
             interactive=True, visible=False)

-    with gr.Accordion("Voice Detection Filter (⚠ experimental)", open=False, visible=True):
-        cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=vad_params["vad_filter"],
-            interactive=True,
-            info="Enable to transcribe only detected voice parts")
-        sd_threshold = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, label="Speech Threshold",
-            value=vad_params["threshold"],
-            info="Lower it to be more sensitive to small sounds")
-        nb_min_speech_duration_ms = gr.Number(label="Minimum Speech Duration (ms)", precision=0,
-            value=vad_params["min_speech_duration_ms"],
-            info="Final speech chunks shorter than this time are thrown out")
-        nb_max_speech_duration_s = gr.Number(label="Maximum Speech Duration (s)",
-            value=vad_params["max_speech_duration_s"],
-            info="Maximum duration of speech chunks in seconds")
-        nb_min_silence_duration_ms = gr.Number(label="Minimum Silence Duration (ms)", precision=0,
-            value=vad_params["min_silence_duration_ms"],
-            info="In the end of each speech chunk wait for this time"
-                " before separating it")
-        nb_speech_pad_ms = gr.Number(label="Speech Padding (ms)", precision=0, value=vad_params["speech_pad_ms"],
-            info="Final speech chunks are padded by this time each side")
-
-    with gr.Accordion("Background Music Remover Filter (⚠ experimental)", open=False):
-        cb_bgm_separation = gr.Checkbox(label="Enable Background Music Remover Filter", value=uvr_params["is_separate_bgm"],
-            info="Enable to remove background music by submodel before transcribing",
+    with gr.Accordion("Preprocessing options (⚠ Beta)", open=False, visible=True):
+        with gr.Accordion("Voice Detection Filter", open=False, visible=True):
+            cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=vad_params["vad_filter"],
+                info="Enable to transcribe only detected voice parts",
                 interactive=True)
-        dd_uvr_device = gr.Dropdown(label="Device",
-            value=self.whisper_inf.music_separator.device,
-            choices=self.whisper_inf.music_separator.available_devices,
-            interactive=True, visible=False)
-        dd_uvr_model_size = gr.Dropdown(label="Model", value=uvr_params["model_size"],
-            choices=self.whisper_inf.music_separator.available_models)
-        nb_uvr_segment_size = gr.Number(label="Segment Size", value=uvr_params["segment_size"], precision=0,
-            interactive=True, visible=False)
-        cb_uvr_save_file = gr.Checkbox(label="Save separated files to output", value=uvr_params["save_file"],
-            interactive=True, visible=False)
-        cb_uvr_enable_offload = gr.Checkbox(label="Offload sub model after removing background music",value=uvr_params["enable_offload"],
-            interactive=True, visible=False)
+            sd_threshold = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, label="Speech Threshold",
+                value=vad_params["threshold"],
+                info="Lower it to be more sensitive to small sounds")
+            nb_min_speech_duration_ms = gr.Number(label="Minimum Speech Duration (ms)", precision=0,
+                value=vad_params["min_speech_duration_ms"],
+                info="Final speech chunks shorter than this time are thrown out")
+            nb_max_speech_duration_s = gr.Number(label="Maximum Speech Duration (s)",
+                value=vad_params["max_speech_duration_s"],
+                info="Maximum duration of speech chunks in seconds")
+            nb_min_silence_duration_ms = gr.Number(label="Minimum Silence Duration (ms)", precision=0,
+                value=vad_params["min_silence_duration_ms"],
+                info="In the end of each speech chunk wait for this time"
+                    " before separating it")
+            nb_speech_pad_ms = gr.Number(label="Speech Padding (ms)", precision=0, value=vad_params["speech_pad_ms"],
+                info="Final speech chunks are padded by this time each side")
+
+        with gr.Accordion("Background Music Remover Filter", open=False):
+            cb_bgm_separation = gr.Checkbox(label="Enable Background Music Remover Filter", value=uvr_params["is_separate_bgm"],
+                info="Enable to remove background music by submodel before transcribing",
+                interactive=True)
+            dd_uvr_device = gr.Dropdown(label="Device",
+                value=self.whisper_inf.music_separator.device,
+                choices=self.whisper_inf.music_separator.available_devices,
+                interactive=True, visible=False)
+            dd_uvr_model_size = gr.Dropdown(label="Model", value=uvr_params["model_size"],
+                choices=self.whisper_inf.music_separator.available_models,
+                interactive=True)
+            nb_uvr_segment_size = gr.Number(label="Segment Size", value=uvr_params["segment_size"], precision=0,
+                interactive=True, visible=False)
+            cb_uvr_save_file = gr.Checkbox(label="Save separated files to output", value=uvr_params["save_file"],
+                interactive=True, visible=False)
+            cb_uvr_enable_offload = gr.Checkbox(label="Offload sub model after removing background music",value=uvr_params["enable_offload"],
+                interactive=True, visible=False)

     with gr.Accordion("Advanced processing options", open=False, visible=False):
         nb_beam_size = gr.Number(label="Beam Size", value=whisper_params["beam_size"], precision=0, interactive=True,
@@ -293,7 +294,7 @@ class App:
     btn_reset.click(None,js="window.location.reload()")
     with gr.Row():
         with gr.Column(scale=4):
-            tb_indicator = gr.Textbox(label="Output preview (Always review & verify the output generated by AI models)", show_copy_button=True, show_label=True)
+            tb_indicator = gr.Textbox(label="Output preview (Always review output generated by AI models)", show_copy_button=True, show_label=True)
         with gr.Column(scale=1):
             tb_info = gr.Textbox(label="Output info", interactive=False, show_copy_button=True)
             files_subtitles = gr.Files(label="Output data", interactive=False, file_count="multiple")
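The layout change in the third hunk is easier to see outside the diff: the Voice Detection Filter and Background Music Remover Filter accordions now sit inside a single new "Preprocessing options (⚠ Beta)" accordion. The following is only an illustrative, self-contained Gradio sketch of that nesting; checkbox defaults are hard-coded here instead of coming from the app's vad_params / uvr_params dicts, and the tuning controls inside each accordion are omitted.

import gradio as gr

# Illustrative sketch of the accordion nesting introduced by this commit.
# Assumptions: values default to False (the real app reads them from
# vad_params / uvr_params), and the slider/number inputs are left out.
with gr.Blocks() as demo:
    with gr.Accordion("Preprocessing options (⚠ Beta)", open=False, visible=True):
        with gr.Accordion("Voice Detection Filter", open=False, visible=True):
            cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=False,
                                        info="Enable to transcribe only detected voice parts",
                                        interactive=True)
        with gr.Accordion("Background Music Remover Filter", open=False):
            cb_bgm_separation = gr.Checkbox(label="Enable Background Music Remover Filter", value=False,
                                            info="Enable to remove background music by submodel before transcribing",
                                            interactive=True)

if __name__ == "__main__":
    demo.launch()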