jhj0517 committed
Commit 31adf69 · 2 Parent(s): 2353351 0b0f426

Merge master

.gitignore CHANGED
@@ -1,3 +1,7 @@
 venv/
 ui/__pycache__/
 outputs/

+*.wav
+*.png
+*.mp4
+*.mp3
 venv/
 ui/__pycache__/
 outputs/
app.py CHANGED
@@ -1,7 +1,8 @@
 import os
 import argparse

-from modules.whisper.whisper_Inference import WhisperInference
 from modules.whisper.faster_whisper_inference import FasterWhisperInference
 from modules.whisper.insanely_fast_whisper_inference import InsanelyFastWhisperInference
 from modules.translation.nllb_inference import NLLBInference
@@ -15,68 +16,150 @@ class App:
    def __init__(self, args):
        self.args = args
        self.app = gr.Blocks(css=CSS, theme=self.args.theme)
-        self.whisper_inf = FasterWhisperInference(
-            model_dir=self.args.faster_whisper_model_dir,
            output_dir=self.args.output_dir,
-            args=self.args
        )
        print(f"Use \"{self.args.whisper_type}\" implementation")
        print(f"Device \"{self.whisper_inf.device}\" is detected")
        self.nllb_inf = NLLBInference(
            model_dir=self.args.nllb_model_dir,
-            output_dir=self.args.output_dir
        )
        self.deepl_api = DeepLAPI(
-            output_dir=self.args.output_dir
        )

-    def init_whisper(self):
-        # Temporal fix of the issue : https://github.com/jhj0517/Whisper-WebUI/issues/144
-        os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'
-
-        whisper_type = self.args.whisper_type.lower().strip()
-
-        if whisper_type in ["faster_whisper", "faster-whisper", "fasterwhisper"]:
-            whisper_inf = FasterWhisperInference(
-                model_dir=self.args.faster_whisper_model_dir,
-                output_dir=self.args.output_dir,
-                args=self.args
-            )
-        elif whisper_type in ["whisper"]:
-            whisper_inf = WhisperInference(
-                model_dir=self.args.whisper_model_dir,
-                output_dir=self.args.output_dir,
-                args=self.args
-            )
-        elif whisper_type in ["insanely_fast_whisper", "insanely-fast-whisper", "insanelyfastwhisper",
-                              "insanely_faster_whisper", "insanely-faster-whisper", "insanelyfasterwhisper"]:
-            whisper_inf = InsanelyFastWhisperInference(
-                model_dir=self.args.insanely_fast_whisper_model_dir,
-                output_dir=self.args.output_dir,
-                args=self.args
-            )
-        else:
-            whisper_inf = FasterWhisperInference(
-                model_dir=self.args.faster_whisper_model_dir,
-                output_dir=self.args.output_dir,
-                args=self.args
-            )
-        return whisper_inf

-    @staticmethod
-    def open_folder(folder_path: str):
-        if os.path.exists(folder_path):
-            os.system(f"start {folder_path}")
-        else:
-            print(f"The folder {folder_path} does not exist.")

-    @staticmethod
-    def on_change_models(model_size: str):
-        translatable_model = ["large", "large-v1", "large-v2", "large-v3"]
-        if model_size not in translatable_model:
-            return gr.Checkbox(visible=False, value=False, interactive=False)
-        else:
-            return gr.Checkbox(visible=True, value=False, label="Translate to English?", interactive=True)

    def launch(self):
        with self.app:
@@ -85,84 +168,28 @@ class App:
                    gr.Markdown(MARKDOWN, elem_id="md_project")
            with gr.Tabs():
                with gr.TabItem("File"):  # tab1
-                    with gr.Row():
                        input_file = gr.Files(type="filepath", label="Upload File here")
-                    with gr.Row():
-                        dd_model = gr.Dropdown(choices=self.whisper_inf.available_models, value="large-v2",
-                                               label="Model")
-                        dd_lang = gr.Dropdown(choices=["Automatic Detection"] + self.whisper_inf.available_langs,
-                                              value="Automatic Detection", label="Language")
-                        dd_file_format = gr.Dropdown(["SRT", "WebVTT", "txt"], value="SRT", label="File Format")
-                    with gr.Row():
-                        cb_translate = gr.Checkbox(value=False, label="Translate to English?", interactive=True)
-                    with gr.Row():
-                        cb_timestamp = gr.Checkbox(value=True, label="Add a timestamp to the end of the filename", interactive=True)
-                    with gr.Accordion("Advanced Parameters", open=False):
-                        nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0, interactive=True)
-                        nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True)
-                        nb_no_speech_threshold = gr.Number(label="No Speech Threshold", value=0.6, interactive=True)
-                        dd_compute_type = gr.Dropdown(label="Compute Type", choices=self.whisper_inf.available_compute_types, value=self.whisper_inf.current_compute_type, interactive=True)
-                        nb_best_of = gr.Number(label="Best Of", value=5, interactive=True)
-                        nb_patience = gr.Number(label="Patience", value=1, interactive=True)
-                        cb_condition_on_previous_text = gr.Checkbox(label="Condition On Previous Text", value=True, interactive=True)
-                        tb_initial_prompt = gr.Textbox(label="Initial Prompt", value=None, interactive=True)
-                        sd_temperature = gr.Slider(label="Temperature", value=0, step=0.01, maximum=1.0, interactive=True)
-                        nb_compression_ratio_threshold = gr.Number(label="Compression Ratio Threshold", value=2.4, interactive=True)
-                    with gr.Accordion("VAD", open=False):
-                        cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=False, interactive=True)
-                        sd_threshold = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, label="Speech Threshold", value=0.5, info="Lower it to be more sensitive to small sounds.")
-                        nb_min_speech_duration_ms = gr.Number(label="Minimum Speech Duration (ms)", precision=0, value=250)
-                        nb_max_speech_duration_s = gr.Number(label="Maximum Speech Duration (s)", value=9999)
-                        nb_min_silence_duration_ms = gr.Number(label="Minimum Silence Duration (ms)", precision=0, value=2000)
-                        nb_window_size_sample = gr.Number(label="Window Size (samples)", precision=0, value=1024)
-                        nb_speech_pad_ms = gr.Number(label="Speech Padding (ms)", precision=0, value=400)
-                    with gr.Accordion("Diarization", open=False):
-                        cb_diarize = gr.Checkbox(label="Enable Diarization")
-                        tb_hf_token = gr.Text(label="HuggingFace Token", value="",
-                                              info="This is only needed the first time you download the model. If you already have models, you don't need to enter. "
-                                                   "To download the model, you must manually go to \"https://huggingface.co/pyannote/speaker-diarization-3.1\" and agree to their requirement.")
-                        dd_diarization_device = gr.Dropdown(label="Device", choices=self.whisper_inf.diarizer.get_available_device(), value=self.whisper_inf.diarizer.get_device())
-                    with gr.Accordion("Insanely Fast Whisper Parameters", open=False, visible=isinstance(self.whisper_inf, InsanelyFastWhisperInference)):
-                        nb_chunk_length_s = gr.Number(label="Chunk Lengths (sec)", value=30, precision=0)
-                        nb_batch_size = gr.Number(label="Batch Size", value=24, precision=0)
                    with gr.Row():
                        btn_run = gr.Button("GENERATE SUBTITLE FILE", variant="primary")
                    with gr.Row():
-                        tb_indicator = gr.Textbox(label="Output", scale=6)
                        files_subtitles = gr.Files(label="Downloadable output file", scale=3, interactive=False)

-                    params = [input_file, dd_file_format, cb_timestamp]
-
-                    whisper_params = WhisperParameters(model_size=dd_model,
-                                                       lang=dd_lang,
-                                                       is_translate=cb_translate,
-                                                       beam_size=nb_beam_size,
-                                                       log_prob_threshold=nb_log_prob_threshold,
-                                                       no_speech_threshold=nb_no_speech_threshold,
-                                                       compute_type=dd_compute_type,
-                                                       best_of=nb_best_of,
-                                                       patience=nb_patience,
-                                                       condition_on_previous_text=cb_condition_on_previous_text,
-                                                       initial_prompt=tb_initial_prompt,
-                                                       temperature=sd_temperature,
-                                                       compression_ratio_threshold=nb_compression_ratio_threshold,
-                                                       vad_filter=cb_vad_filter,
-                                                       threshold=sd_threshold,
-                                                       min_speech_duration_ms=nb_min_speech_duration_ms,
-                                                       max_speech_duration_s=nb_max_speech_duration_s,
-                                                       min_silence_duration_ms=nb_min_silence_duration_ms,
-                                                       window_size_sample=nb_window_size_sample,
-                                                       speech_pad_ms=nb_speech_pad_ms,
-                                                       chunk_length_s=nb_chunk_length_s,
-                                                       batch_size=nb_batch_size,
-                                                       is_diarize=cb_diarize,
-                                                       hf_token=tb_hf_token,
-                                                       diarization_device=dd_diarization_device)
-
                    btn_run.click(fn=self.whisper_inf.transcribe_file,
-                                  inputs=params+whisper_params.as_list(),
                                  outputs=[tb_indicator, files_subtitles])
-                    dd_model.change(fn=self.on_change_models, inputs=[dd_model], outputs=[cb_translate])

                with gr.TabItem("Youtube"):  # tab2
                    with gr.Row():
@@ -173,164 +200,44 @@ class App:
173
  with gr.Column():
174
  tb_title = gr.Label(label="Youtube Title")
175
  tb_description = gr.Textbox(label="Youtube Description", max_lines=15)
176
- with gr.Row():
177
- dd_model = gr.Dropdown(choices=self.whisper_inf.available_models, value="large-v2",
178
- label="Model")
179
- dd_lang = gr.Dropdown(choices=["Automatic Detection"] + self.whisper_inf.available_langs,
180
- value="Automatic Detection", label="Language")
181
- dd_file_format = gr.Dropdown(choices=["SRT", "WebVTT", "txt"], value="SRT", label="File Format")
182
- with gr.Row():
183
- cb_translate = gr.Checkbox(value=False, label="Translate to English?", interactive=True)
184
- with gr.Row():
185
- cb_timestamp = gr.Checkbox(value=True, label="Add a timestamp to the end of the filename",
186
- interactive=True)
187
- with gr.Accordion("Advanced Parameters", open=False):
188
- nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0, interactive=True)
189
- nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True)
190
- nb_no_speech_threshold = gr.Number(label="No Speech Threshold", value=0.6, interactive=True)
191
- dd_compute_type = gr.Dropdown(label="Compute Type", choices=self.whisper_inf.available_compute_types, value=self.whisper_inf.current_compute_type, interactive=True)
192
- nb_best_of = gr.Number(label="Best Of", value=5, interactive=True)
193
- nb_patience = gr.Number(label="Patience", value=1, interactive=True)
194
- cb_condition_on_previous_text = gr.Checkbox(label="Condition On Previous Text", value=True, interactive=True)
195
- tb_initial_prompt = gr.Textbox(label="Initial Prompt", value=None, interactive=True)
196
- sd_temperature = gr.Slider(label="Temperature", value=0, step=0.01, maximum=1.0, interactive=True)
197
- nb_compression_ratio_threshold = gr.Number(label="Compression Ratio Threshold", value=2.4, interactive=True)
198
- with gr.Accordion("VAD", open=False):
199
- cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=False, interactive=True)
200
- sd_threshold = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, label="Speech Threshold", value=0.5, info="Lower it to be more sensitive to small sounds.")
201
- nb_min_speech_duration_ms = gr.Number(label="Minimum Speech Duration (ms)", precision=0, value=250)
202
- nb_max_speech_duration_s = gr.Number(label="Maximum Speech Duration (s)", value=9999)
203
- nb_min_silence_duration_ms = gr.Number(label="Minimum Silence Duration (ms)", precision=0, value=2000)
204
- nb_window_size_sample = gr.Number(label="Window Size (samples)", precision=0, value=1024)
205
- nb_speech_pad_ms = gr.Number(label="Speech Padding (ms)", precision=0, value=400)
206
- with gr.Accordion("Diarization", open=False):
207
- cb_diarize = gr.Checkbox(label="Enable Diarization")
208
- tb_hf_token = gr.Text(label="HuggingFace Token", value="",
209
- info="This is only needed the first time you download the model. If you already have models, you don't need to enter. "
210
- "To download the model, you must manually go to \"https://huggingface.co/pyannote/speaker-diarization-3.1\" and agree to their requirement.")
211
- dd_diarization_device = gr.Dropdown(label="Device", choices=self.whisper_inf.diarizer.get_available_device(), value=self.whisper_inf.diarizer.get_device())
212
- with gr.Accordion("Insanely Fast Whisper Parameters", open=False,
213
- visible=isinstance(self.whisper_inf, InsanelyFastWhisperInference)):
214
- nb_chunk_length_s = gr.Number(label="Chunk Lengths (sec)", value=30, precision=0)
215
- nb_batch_size = gr.Number(label="Batch Size", value=24, precision=0)
216
  with gr.Row():
217
  btn_run = gr.Button("GENERATE SUBTITLE FILE", variant="primary")
218
  with gr.Row():
219
- tb_indicator = gr.Textbox(label="Output", scale=6)
220
  files_subtitles = gr.Files(label="Downloadable output file", scale=3)
 
221
 
222
  params = [tb_youtubelink, dd_file_format, cb_timestamp]
223
- whisper_params = WhisperParameters(model_size=dd_model,
224
- lang=dd_lang,
225
- is_translate=cb_translate,
226
- beam_size=nb_beam_size,
227
- log_prob_threshold=nb_log_prob_threshold,
228
- no_speech_threshold=nb_no_speech_threshold,
229
- compute_type=dd_compute_type,
230
- best_of=nb_best_of,
231
- patience=nb_patience,
232
- condition_on_previous_text=cb_condition_on_previous_text,
233
- initial_prompt=tb_initial_prompt,
234
- temperature=sd_temperature,
235
- compression_ratio_threshold=nb_compression_ratio_threshold,
236
- vad_filter=cb_vad_filter,
237
- threshold=sd_threshold,
238
- min_speech_duration_ms=nb_min_speech_duration_ms,
239
- max_speech_duration_s=nb_max_speech_duration_s,
240
- min_silence_duration_ms=nb_min_silence_duration_ms,
241
- window_size_sample=nb_window_size_sample,
242
- speech_pad_ms=nb_speech_pad_ms,
243
- chunk_length_s=nb_chunk_length_s,
244
- batch_size=nb_batch_size,
245
- is_diarize=cb_diarize,
246
- hf_token=tb_hf_token,
247
- diarization_device=dd_diarization_device)
248
 
249
  btn_run.click(fn=self.whisper_inf.transcribe_youtube,
250
  inputs=params + whisper_params.as_list(),
251
  outputs=[tb_indicator, files_subtitles])
252
  tb_youtubelink.change(get_ytmetas, inputs=[tb_youtubelink],
253
  outputs=[img_thumbnail, tb_title, tb_description])
254
- dd_model.change(fn=self.on_change_models, inputs=[dd_model], outputs=[cb_translate])
255
 
256
  with gr.TabItem("Mic"): # tab3
257
  with gr.Row():
258
  mic_input = gr.Microphone(label="Record with Mic", type="filepath", interactive=True)
259
- with gr.Row():
260
- dd_model = gr.Dropdown(choices=self.whisper_inf.available_models, value="large-v2",
261
- label="Model")
262
- dd_lang = gr.Dropdown(choices=["Automatic Detection"] + self.whisper_inf.available_langs,
263
- value="Automatic Detection", label="Language")
264
- dd_file_format = gr.Dropdown(["SRT", "WebVTT", "txt"], value="SRT", label="File Format")
265
- with gr.Row():
266
- cb_translate = gr.Checkbox(value=False, label="Translate to English?", interactive=True)
267
- with gr.Accordion("Advanced Parameters", open=False):
268
- nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0, interactive=True)
269
- nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True)
270
- nb_no_speech_threshold = gr.Number(label="No Speech Threshold", value=0.6, interactive=True)
271
- dd_compute_type = gr.Dropdown(label="Compute Type", choices=self.whisper_inf.available_compute_types, value=self.whisper_inf.current_compute_type, interactive=True)
272
- nb_best_of = gr.Number(label="Best Of", value=5, interactive=True)
273
- nb_patience = gr.Number(label="Patience", value=1, interactive=True)
274
- cb_condition_on_previous_text = gr.Checkbox(label="Condition On Previous Text", value=True, interactive=True)
275
- tb_initial_prompt = gr.Textbox(label="Initial Prompt", value=None, interactive=True)
276
- sd_temperature = gr.Slider(label="Temperature", value=0, step=0.01, maximum=1.0, interactive=True)
277
- with gr.Accordion("VAD", open=False):
278
- cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=False, interactive=True)
279
- sd_threshold = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, label="Speech Threshold", value=0.5, info="Lower it to be more sensitive to small sounds.")
280
- nb_min_speech_duration_ms = gr.Number(label="Minimum Speech Duration (ms)", precision=0, value=250)
281
- nb_max_speech_duration_s = gr.Number(label="Maximum Speech Duration (s)", value=9999)
282
- nb_min_silence_duration_ms = gr.Number(label="Minimum Silence Duration (ms)", precision=0, value=2000)
283
- nb_window_size_sample = gr.Number(label="Window Size (samples)", precision=0, value=1024)
284
- nb_speech_pad_ms = gr.Number(label="Speech Padding (ms)", precision=0, value=400)
285
- with gr.Accordion("Diarization", open=False):
286
- cb_diarize = gr.Checkbox(label="Enable Diarization")
287
- tb_hf_token = gr.Text(label="HuggingFace Token", value="",
288
- info="This is only needed the first time you download the model. If you already have models, you don't need to enter. "
289
- "To download the model, you must manually go to \"https://huggingface.co/pyannote/speaker-diarization-3.1\" and agree to their requirement.")
290
- dd_diarization_device = gr.Dropdown(label="Device",
291
- choices=self.whisper_inf.diarizer.get_available_device(),
292
- value=self.whisper_inf.diarizer.get_device())
293
- with gr.Accordion("Insanely Fast Whisper Parameters", open=False,
294
- visible=isinstance(self.whisper_inf, InsanelyFastWhisperInference)):
295
- nb_chunk_length_s = gr.Number(label="Chunk Lengths (sec)", value=30, precision=0)
296
- nb_batch_size = gr.Number(label="Batch Size", value=24, precision=0)
297
  with gr.Row():
298
  btn_run = gr.Button("GENERATE SUBTITLE FILE", variant="primary")
299
  with gr.Row():
300
- tb_indicator = gr.Textbox(label="Output", scale=6)
301
  files_subtitles = gr.Files(label="Downloadable output file", scale=3)
 
302
 
303
  params = [mic_input, dd_file_format]
304
- whisper_params = WhisperParameters(model_size=dd_model,
305
- lang=dd_lang,
306
- is_translate=cb_translate,
307
- beam_size=nb_beam_size,
308
- log_prob_threshold=nb_log_prob_threshold,
309
- no_speech_threshold=nb_no_speech_threshold,
310
- compute_type=dd_compute_type,
311
- best_of=nb_best_of,
312
- patience=nb_patience,
313
- condition_on_previous_text=cb_condition_on_previous_text,
314
- initial_prompt=tb_initial_prompt,
315
- temperature=sd_temperature,
316
- compression_ratio_threshold=nb_compression_ratio_threshold,
317
- vad_filter=cb_vad_filter,
318
- threshold=sd_threshold,
319
- min_speech_duration_ms=nb_min_speech_duration_ms,
320
- max_speech_duration_s=nb_max_speech_duration_s,
321
- min_silence_duration_ms=nb_min_silence_duration_ms,
322
- window_size_sample=nb_window_size_sample,
323
- speech_pad_ms=nb_speech_pad_ms,
324
- chunk_length_s=nb_chunk_length_s,
325
- batch_size=nb_batch_size,
326
- is_diarize=cb_diarize,
327
- hf_token=tb_hf_token,
328
- diarization_device=dd_diarization_device)
329
 
330
  btn_run.click(fn=self.whisper_inf.transcribe_mic,
331
  inputs=params + whisper_params.as_list(),
332
  outputs=[tb_indicator, files_subtitles])
333
- dd_model.change(fn=self.on_change_models, inputs=[dd_model], outputs=[cb_translate])
334
 
335
  with gr.TabItem("T2T Translation"): # tab 4
336
  with gr.Row():
@@ -350,17 +257,25 @@ class App:
                                                            self.deepl_api.available_target_langs.keys()))
                        with gr.Row():
                            cb_deepl_ispro = gr.Checkbox(label="Pro User?", value=False)
                        with gr.Row():
                            btn_run = gr.Button("TRANSLATE SUBTITLE FILE", variant="primary")
                        with gr.Row():
                            tb_indicator = gr.Textbox(label="Output", scale=5)
                            files_subtitles = gr.Files(label="Downloadable output file", scale=3)

                        btn_run.click(fn=self.deepl_api.translate_deepl,
                                      inputs=[tb_authkey, file_subs, dd_deepl_sourcelang, dd_deepl_targetlang,
-                                              cb_deepl_ispro],
                                      outputs=[tb_indicator, files_subtitles])

                    with gr.TabItem("NLLB"):  # sub tab2
                        with gr.Row():
                            dd_nllb_model = gr.Dropdown(label="Model", value="facebook/nllb-200-1.3B",
@@ -369,6 +284,8 @@ class App:
                                                             choices=self.nllb_inf.available_source_langs)
                            dd_nllb_targetlang = gr.Dropdown(label="Target Language",
                                                             choices=self.nllb_inf.available_target_langs)
                        with gr.Row():
                            cb_timestamp = gr.Checkbox(value=True, label="Add a timestamp to the end of the filename",
                                                       interactive=True)
@@ -377,33 +294,53 @@ class App:
                        with gr.Row():
                            tb_indicator = gr.Textbox(label="Output", scale=5)
                            files_subtitles = gr.Files(label="Downloadable output file", scale=3)
                        with gr.Column():
                            md_vram_table = gr.HTML(NLLB_VRAM_TABLE, elem_id="md_nllb_vram_table")

                        btn_run.click(fn=self.nllb_inf.translate_file,
-                                      inputs=[file_subs, dd_nllb_model, dd_nllb_sourcelang, dd_nllb_targetlang, cb_timestamp],
                                      outputs=[tb_indicator, files_subtitles])

            # Launch the app with optional gradio settings
-            launch_args = {}
-            if self.args.share:
-                launch_args['share'] = self.args.share
-            if self.args.server_name:
-                launch_args['server_name'] = self.args.server_name
-            if self.args.server_port:
-                launch_args['server_port'] = self.args.server_port
-            if self.args.username and self.args.password:
-                launch_args['auth'] = (self.args.username, self.args.password)
-            if self.args.root_path:
-                launch_args['root_path'] = self.args.root_path
-            launch_args['inbrowser'] = True
-
-            self.app.queue(api_open=False).launch(**launch_args)


# Create the parser for command-line arguments
parser = argparse.ArgumentParser()
-parser.add_argument('--whisper_type', type=str, default="faster-whisper", help='A type of the whisper implementation between: ["whisper", "faster-whisper", "insanely-fast-whisper"]')
parser.add_argument('--share', type=bool, default=False, nargs='?', const=True, help='Gradio share value')
parser.add_argument('--server_name', type=str, default=None, help='Gradio server host')
parser.add_argument('--server_port', type=int, default=None, help='Gradio server port')
@@ -412,12 +349,19 @@ parser.add_argument('--username', type=str, default=None, help='Gradio authentic
parser.add_argument('--password', type=str, default=None, help='Gradio authentication password')
parser.add_argument('--theme', type=str, default=None, help='Gradio Blocks theme')
parser.add_argument('--colab', type=bool, default=False, nargs='?', const=True, help='Is colab user or not')
-parser.add_argument('--api_open', type=bool, default=False, nargs='?', const=True, help='enable api or not')
-parser.add_argument('--whisper_model_dir', type=str, default=os.path.join("models", "Whisper"), help='Directory path of the whisper model')
-parser.add_argument('--faster_whisper_model_dir', type=str, default=os.path.join("models", "Whisper", "faster-whisper"), help='Directory path of the faster-whisper model')
-parser.add_argument('--insanely_fast_whisper_model_dir', type=str, default=os.path.join("models", "Whisper", "insanely-fast-whisper"), help='Directory path of the insanely-fast-whisper model')
-parser.add_argument('--diarization_model_dir', type=str, default=os.path.join("models", "Diarization"), help='Directory path of the diarization model')
-parser.add_argument('--nllb_model_dir', type=str, default=os.path.join("models", "NLLB"), help='Directory path of the Facebook NLLB model')
parser.add_argument('--output_dir', type=str, default=os.path.join("outputs"), help='Directory path of the outputs')
_args = parser.parse_args()
 
 import os
 import argparse
+import gradio as gr

+from modules.whisper.whisper_factory import WhisperFactory
 from modules.whisper.faster_whisper_inference import FasterWhisperInference
 from modules.whisper.insanely_fast_whisper_inference import InsanelyFastWhisperInference
 from modules.translation.nllb_inference import NLLBInference

    def __init__(self, args):
        self.args = args
        self.app = gr.Blocks(css=CSS, theme=self.args.theme)
+        self.whisper_inf = WhisperFactory.create_whisper_inference(
+            whisper_type=self.args.whisper_type,
+            whisper_model_dir=self.args.whisper_model_dir,
+            faster_whisper_model_dir=self.args.faster_whisper_model_dir,
+            insanely_fast_whisper_model_dir=self.args.insanely_fast_whisper_model_dir,
            output_dir=self.args.output_dir,
        )
        print(f"Use \"{self.args.whisper_type}\" implementation")
        print(f"Device \"{self.whisper_inf.device}\" is detected")
        self.nllb_inf = NLLBInference(
            model_dir=self.args.nllb_model_dir,
+            output_dir=os.path.join(self.args.output_dir, "translations")
        )
        self.deepl_api = DeepLAPI(
+            output_dir=os.path.join(self.args.output_dir, "translations")
        )

+    def create_whisper_parameters(self):
+        with gr.Row():
+            dd_model = gr.Dropdown(choices=self.whisper_inf.available_models, value="large-v2",
+                                   label="Model")
+            dd_lang = gr.Dropdown(choices=["Automatic Detection"] + self.whisper_inf.available_langs,
+                                  value="Automatic Detection", label="Language")
+            dd_file_format = gr.Dropdown(choices=["SRT", "WebVTT", "txt"], value="SRT", label="File Format")
+        with gr.Row():
+            cb_translate = gr.Checkbox(value=False, label="Translate to English?", interactive=True)
+        with gr.Row():
+            cb_timestamp = gr.Checkbox(value=True, label="Add a timestamp to the end of the filename",
+                                       interactive=True)
+        with gr.Accordion("Advanced Parameters", open=False):
+            nb_beam_size = gr.Number(label="Beam Size", value=5, precision=0, interactive=True,
+                                     info="Beam size to use for decoding.")
+            nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True,
+                                              info="If the average log probability over sampled tokens is below this value, treat as failed.")
+            nb_no_speech_threshold = gr.Number(label="No Speech Threshold", value=0.6, interactive=True,
+                                               info="If the no speech probability is higher than this value AND the average log probability over sampled tokens is below 'Log Prob Threshold', consider the segment as silent.")
+            dd_compute_type = gr.Dropdown(label="Compute Type", choices=self.whisper_inf.available_compute_types,
+                                          value=self.whisper_inf.current_compute_type, interactive=True,
+                                          info="Select the type of computation to perform.")
+            nb_best_of = gr.Number(label="Best Of", value=5, interactive=True,
+                                   info="Number of candidates when sampling with non-zero temperature.")
+            nb_patience = gr.Number(label="Patience", value=1, interactive=True,
+                                    info="Beam search patience factor.")
+            cb_condition_on_previous_text = gr.Checkbox(label="Condition On Previous Text", value=True,
+                                                        interactive=True,
+                                                        info="Condition on previous text during decoding.")
+            sld_prompt_reset_on_temperature = gr.Slider(label="Prompt Reset On Temperature", value=0.5,
+                                                        minimum=0, maximum=1, step=0.01, interactive=True,
+                                                        info="Resets prompt if temperature is above this value."
+                                                             " Arg has effect only if 'Condition On Previous Text' is True.")
+            tb_initial_prompt = gr.Textbox(label="Initial Prompt", value=None, interactive=True,
+                                           info="Initial prompt to use for decoding.")
+            sd_temperature = gr.Slider(label="Temperature", value=0, step=0.01, maximum=1.0, interactive=True,
+                                       info="Temperature for sampling. It can be a tuple of temperatures, which will be successively used upon failures according to either `Compression Ratio Threshold` or `Log Prob Threshold`.")
+            nb_compression_ratio_threshold = gr.Number(label="Compression Ratio Threshold", value=2.4, interactive=True,
+                                                       info="If the gzip compression ratio is above this value, treat as failed.")
+            with gr.Group(visible=isinstance(self.whisper_inf, FasterWhisperInference)):
+                nb_length_penalty = gr.Number(label="Length Penalty", value=1,
+                                              info="Exponential length penalty constant.")
+                nb_repetition_penalty = gr.Number(label="Repetition Penalty", value=1,
+                                                  info="Penalty applied to the score of previously generated tokens (set > 1 to penalize).")
+                nb_no_repeat_ngram_size = gr.Number(label="No Repeat N-gram Size", value=0, precision=0,
+                                                    info="Prevent repetitions of n-grams with this size (set 0 to disable).")
+                tb_prefix = gr.Textbox(label="Prefix", value=lambda: None,
+                                       info="Optional text to provide as a prefix for the first window.")
+                cb_suppress_blank = gr.Checkbox(label="Suppress Blank", value=True,
+                                                info="Suppress blank outputs at the beginning of the sampling.")
+                tb_suppress_tokens = gr.Textbox(label="Suppress Tokens", value="[-1]",
+                                                info="List of token IDs to suppress. -1 will suppress a default set of symbols as defined in the model config.json file.")
+                nb_max_initial_timestamp = gr.Number(label="Max Initial Timestamp", value=1.0,
+                                                     info="The initial timestamp cannot be later than this.")
+                cb_word_timestamps = gr.Checkbox(label="Word Timestamps", value=False,
+                                                 info="Extract word-level timestamps using the cross-attention pattern and dynamic time warping, and include the timestamps for each word in each segment.")
+                tb_prepend_punctuations = gr.Textbox(label="Prepend Punctuations", value="\"'“¿([{-",
+                                                     info="If 'Word Timestamps' is True, merge these punctuation symbols with the next word.")
+                tb_append_punctuations = gr.Textbox(label="Append Punctuations", value="\"'.。,,!!??::”)]}、",
+                                                    info="If 'Word Timestamps' is True, merge these punctuation symbols with the previous word.")
+                nb_max_new_tokens = gr.Number(label="Max New Tokens", value=lambda: None, precision=0,
+                                              info="Maximum number of new tokens to generate per-chunk. If not set, the maximum will be set by the default max_length.")
+                nb_chunk_length = gr.Number(label="Chunk Length", value=lambda: None, precision=0,
+                                            info="The length of audio segments. If it is not None, it will overwrite the default chunk_length of the FeatureExtractor.")
+                nb_hallucination_silence_threshold = gr.Number(label="Hallucination Silence Threshold (sec)",
+                                                               value=lambda: None,
+                                                               info="When 'Word Timestamps' is True, skip silent periods longer than this threshold (in seconds) when a possible hallucination is detected.")
+                tb_hotwords = gr.Textbox(label="Hotwords", value=None,
+                                         info="Hotwords/hint phrases to provide the model with. Has no effect if prefix is not None.")
+                nb_language_detection_threshold = gr.Number(label="Language Detection Threshold", value=None,
+                                                            info="If the maximum probability of the language tokens is higher than this value, the language is detected.")
+                nb_language_detection_segments = gr.Number(label="Language Detection Segments", value=1, precision=0,
+                                                           info="Number of segments to consider for the language detection.")
+            with gr.Group(visible=isinstance(self.whisper_inf, InsanelyFastWhisperInference)):
+                nb_chunk_length_s = gr.Number(label="Chunk Lengths (sec)", value=30, precision=0)
+                nb_batch_size = gr.Number(label="Batch Size", value=24, precision=0)

+        with gr.Accordion("VAD", open=False):
+            cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=False, interactive=True)
+            sd_threshold = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, label="Speech Threshold", value=0.5,
+                                     info="Lower it to be more sensitive to small sounds.")
+            nb_min_speech_duration_ms = gr.Number(label="Minimum Speech Duration (ms)", precision=0, value=250,
+                                                  info="Final speech chunks shorter than this time are thrown out")
+            nb_max_speech_duration_s = gr.Number(label="Maximum Speech Duration (s)", value=9999,
+                                                 info="Maximum duration of speech chunks in \"seconds\". Chunks longer"
+                                                      " than this time will be split at the timestamp of the last silence that"
+                                                      " lasts more than 100ms (if any), to prevent aggressive cutting.")
+            nb_min_silence_duration_ms = gr.Number(label="Minimum Silence Duration (ms)", precision=0, value=2000,
+                                                   info="In the end of each speech chunk wait for this time"
+                                                        " before separating it")
+            nb_speech_pad_ms = gr.Number(label="Speech Padding (ms)", precision=0, value=400,
+                                         info="Final speech chunks are padded by this time each side")

+        with gr.Accordion("Diarization", open=False):
+            cb_diarize = gr.Checkbox(label="Enable Diarization")
+            tb_hf_token = gr.Text(label="HuggingFace Token", value="",
+                                  info="This is only needed the first time you download the model. If you already have models, you don't need to enter. To download the model, you must manually go to \"https://huggingface.co/pyannote/speaker-diarization-3.1\" and agree to their requirement.")
+            dd_diarization_device = gr.Dropdown(label="Device",
+                                                choices=self.whisper_inf.diarizer.get_available_device(),
+                                                value=self.whisper_inf.diarizer.get_device())
+
+        dd_model.change(fn=self.on_change_models, inputs=[dd_model], outputs=[cb_translate])
+
+        return (
+            WhisperParameters(
+                model_size=dd_model, lang=dd_lang, is_translate=cb_translate, beam_size=nb_beam_size,
+                log_prob_threshold=nb_log_prob_threshold, no_speech_threshold=nb_no_speech_threshold,
+                compute_type=dd_compute_type, best_of=nb_best_of, patience=nb_patience,
+                condition_on_previous_text=cb_condition_on_previous_text, initial_prompt=tb_initial_prompt,
+                temperature=sd_temperature, compression_ratio_threshold=nb_compression_ratio_threshold,
+                vad_filter=cb_vad_filter, threshold=sd_threshold, min_speech_duration_ms=nb_min_speech_duration_ms,
+                max_speech_duration_s=nb_max_speech_duration_s, min_silence_duration_ms=nb_min_silence_duration_ms,
+                speech_pad_ms=nb_speech_pad_ms, chunk_length_s=nb_chunk_length_s, batch_size=nb_batch_size,
+                is_diarize=cb_diarize, hf_token=tb_hf_token, diarization_device=dd_diarization_device,
+                length_penalty=nb_length_penalty, repetition_penalty=nb_repetition_penalty,
+                no_repeat_ngram_size=nb_no_repeat_ngram_size, prefix=tb_prefix, suppress_blank=cb_suppress_blank,
+                suppress_tokens=tb_suppress_tokens, max_initial_timestamp=nb_max_initial_timestamp,
+                word_timestamps=cb_word_timestamps, prepend_punctuations=tb_prepend_punctuations,
+                append_punctuations=tb_append_punctuations, max_new_tokens=nb_max_new_tokens, chunk_length=nb_chunk_length,
+                hallucination_silence_threshold=nb_hallucination_silence_threshold, hotwords=tb_hotwords,
+                language_detection_threshold=nb_language_detection_threshold,
+                language_detection_segments=nb_language_detection_segments,
+                prompt_reset_on_temperature=sld_prompt_reset_on_temperature
+            ),
+            dd_file_format,
+            cb_timestamp
+        )
 
    def launch(self):
        with self.app:

                    gr.Markdown(MARKDOWN, elem_id="md_project")
            with gr.Tabs():
                with gr.TabItem("File"):  # tab1
+                    with gr.Column():
                        input_file = gr.Files(type="filepath", label="Upload File here")
+                        tb_input_folder = gr.Textbox(label="Input Folder Path (Optional)",
+                                                     info="Optional: Specify the folder path where the input files are located, if you prefer to use local files instead of uploading them."
+                                                          " Leave this field empty if you do not wish to use a local path.",
+                                                     visible=self.args.colab,
+                                                     value="")
+
+                    whisper_params, dd_file_format, cb_timestamp = self.create_whisper_parameters()
+
                    with gr.Row():
                        btn_run = gr.Button("GENERATE SUBTITLE FILE", variant="primary")
                    with gr.Row():
+                        tb_indicator = gr.Textbox(label="Output", scale=5)
                        files_subtitles = gr.Files(label="Downloadable output file", scale=3, interactive=False)
+                        btn_openfolder = gr.Button('📂', scale=1)

+                    params = [input_file, tb_input_folder, dd_file_format, cb_timestamp]
                    btn_run.click(fn=self.whisper_inf.transcribe_file,
+                                  inputs=params + whisper_params.as_list(),
                                  outputs=[tb_indicator, files_subtitles])
+                    btn_openfolder.click(fn=lambda: self.open_folder("outputs"), inputs=None, outputs=None)

                with gr.TabItem("Youtube"):  # tab2
                    with gr.Row():

                        with gr.Column():
                            tb_title = gr.Label(label="Youtube Title")
                            tb_description = gr.Textbox(label="Youtube Description", max_lines=15)
+
+                    whisper_params, dd_file_format, cb_timestamp = self.create_whisper_parameters()
+
                    with gr.Row():
                        btn_run = gr.Button("GENERATE SUBTITLE FILE", variant="primary")
                    with gr.Row():
+                        tb_indicator = gr.Textbox(label="Output", scale=5)
                        files_subtitles = gr.Files(label="Downloadable output file", scale=3)
+                        btn_openfolder = gr.Button('📂', scale=1)

                    params = [tb_youtubelink, dd_file_format, cb_timestamp]

                    btn_run.click(fn=self.whisper_inf.transcribe_youtube,
                                  inputs=params + whisper_params.as_list(),
                                  outputs=[tb_indicator, files_subtitles])
                    tb_youtubelink.change(get_ytmetas, inputs=[tb_youtubelink],
                                          outputs=[img_thumbnail, tb_title, tb_description])
+                    btn_openfolder.click(fn=lambda: self.open_folder("outputs"), inputs=None, outputs=None)

                with gr.TabItem("Mic"):  # tab3
                    with gr.Row():
                        mic_input = gr.Microphone(label="Record with Mic", type="filepath", interactive=True)
+
+                    whisper_params, dd_file_format, cb_timestamp = self.create_whisper_parameters()
+
                    with gr.Row():
                        btn_run = gr.Button("GENERATE SUBTITLE FILE", variant="primary")
                    with gr.Row():
+                        tb_indicator = gr.Textbox(label="Output", scale=5)
                        files_subtitles = gr.Files(label="Downloadable output file", scale=3)
+                        btn_openfolder = gr.Button('📂', scale=1)

                    params = [mic_input, dd_file_format]

                    btn_run.click(fn=self.whisper_inf.transcribe_mic,
                                  inputs=params + whisper_params.as_list(),
                                  outputs=[tb_indicator, files_subtitles])
+                    btn_openfolder.click(fn=lambda: self.open_folder("outputs"), inputs=None, outputs=None)
 
                with gr.TabItem("T2T Translation"):  # tab 4
                    with gr.Row():

                                                            self.deepl_api.available_target_langs.keys()))
                        with gr.Row():
                            cb_deepl_ispro = gr.Checkbox(label="Pro User?", value=False)
+                        with gr.Row():
+                            cb_timestamp = gr.Checkbox(value=True, label="Add a timestamp to the end of the filename",
+                                                       interactive=True)
                        with gr.Row():
                            btn_run = gr.Button("TRANSLATE SUBTITLE FILE", variant="primary")
                        with gr.Row():
                            tb_indicator = gr.Textbox(label="Output", scale=5)
                            files_subtitles = gr.Files(label="Downloadable output file", scale=3)
+                            btn_openfolder = gr.Button('📂', scale=1)

                        btn_run.click(fn=self.deepl_api.translate_deepl,
                                      inputs=[tb_authkey, file_subs, dd_deepl_sourcelang, dd_deepl_targetlang,
+                                              cb_deepl_ispro, cb_timestamp],
                                      outputs=[tb_indicator, files_subtitles])

+                        btn_openfolder.click(fn=lambda: self.open_folder(os.path.join("outputs", "translations")),
+                                             inputs=None,
+                                             outputs=None)
+
                    with gr.TabItem("NLLB"):  # sub tab2
                        with gr.Row():
                            dd_nllb_model = gr.Dropdown(label="Model", value="facebook/nllb-200-1.3B",

                                                             choices=self.nllb_inf.available_source_langs)
                            dd_nllb_targetlang = gr.Dropdown(label="Target Language",
                                                             choices=self.nllb_inf.available_target_langs)
+                        with gr.Row():
+                            nb_max_length = gr.Number(label="Max Length Per Line", value=200, precision=0)
                        with gr.Row():
                            cb_timestamp = gr.Checkbox(value=True, label="Add a timestamp to the end of the filename",
                                                       interactive=True)

                        with gr.Row():
                            tb_indicator = gr.Textbox(label="Output", scale=5)
                            files_subtitles = gr.Files(label="Downloadable output file", scale=3)
+                            btn_openfolder = gr.Button('📂', scale=1)
                        with gr.Column():
                            md_vram_table = gr.HTML(NLLB_VRAM_TABLE, elem_id="md_nllb_vram_table")

                        btn_run.click(fn=self.nllb_inf.translate_file,
+                                      inputs=[file_subs, dd_nllb_model, dd_nllb_sourcelang, dd_nllb_targetlang,
+                                              nb_max_length, cb_timestamp],
                                      outputs=[tb_indicator, files_subtitles])

+                        btn_openfolder.click(fn=lambda: self.open_folder(os.path.join("outputs", "translations")),
+                                             inputs=None,
+                                             outputs=None)
+
            # Launch the app with optional gradio settings
+            args = self.args
+
+            self.app.queue(
+                api_open=args.api_open
+            ).launch(
+                share=args.share,
+                server_name=args.server_name,
+                server_port=args.server_port,
+                auth=(args.username, args.password) if args.username and args.password else None,
+                root_path=args.root_path,
+                inbrowser=args.inbrowser
+            )
+
+    @staticmethod
+    def open_folder(folder_path: str):
+        if os.path.exists(folder_path):
+            os.system(f"start {folder_path}")
+        else:
+            print(f"The folder {folder_path} does not exist.")
+
+    @staticmethod
+    def on_change_models(model_size: str):
+        translatable_model = ["large", "large-v1", "large-v2", "large-v3"]
+        if model_size not in translatable_model:
+            return gr.Checkbox(visible=False, value=False, interactive=False)
+        else:
+            return gr.Checkbox(visible=True, value=False, label="Translate to English?", interactive=True)


# Create the parser for command-line arguments
parser = argparse.ArgumentParser()
+parser.add_argument('--whisper_type', type=str, default="faster-whisper",
+                    help='A type of the whisper implementation between: ["whisper", "faster-whisper", "insanely-fast-whisper"]')
parser.add_argument('--share', type=bool, default=False, nargs='?', const=True, help='Gradio share value')
parser.add_argument('--server_name', type=str, default=None, help='Gradio server host')
parser.add_argument('--server_port', type=int, default=None, help='Gradio server port')

parser.add_argument('--password', type=str, default=None, help='Gradio authentication password')
parser.add_argument('--theme', type=str, default=None, help='Gradio Blocks theme')
parser.add_argument('--colab', type=bool, default=False, nargs='?', const=True, help='Is colab user or not')
+parser.add_argument('--api_open', type=bool, default=False, nargs='?', const=True, help='Enable api or not in Gradio')
+parser.add_argument('--inbrowser', type=bool, default=True, nargs='?', const=True, help='Whether to automatically start Gradio app or not')
+parser.add_argument('--whisper_model_dir', type=str, default=os.path.join("models", "Whisper"),
+                    help='Directory path of the whisper model')
+parser.add_argument('--faster_whisper_model_dir', type=str, default=os.path.join("models", "Whisper", "faster-whisper"),
+                    help='Directory path of the faster-whisper model')
+parser.add_argument('--insanely_fast_whisper_model_dir', type=str,
+                    default=os.path.join("models", "Whisper", "insanely-fast-whisper"),
+                    help='Directory path of the insanely-fast-whisper model')
+parser.add_argument('--diarization_model_dir', type=str, default=os.path.join("models", "Diarization"),
+                    help='Directory path of the diarization model')
+parser.add_argument('--nllb_model_dir', type=str, default=os.path.join("models", "NLLB"),
+                    help='Directory path of the Facebook NLLB model')
parser.add_argument('--output_dir', type=str, default=os.path.join("outputs"), help='Directory path of the outputs')
_args = parser.parse_args()
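Note: modules/whisper/whisper_factory.py is imported above but is not part of this commit's diff, so its contents are not shown here. As a hedged sketch only, a factory like the one called in __init__ could simply normalize the --whisper_type string and dispatch to one of the three inference implementations (the real module may do more, e.g. model-dir handling):

# Hypothetical sketch -- not the actual modules/whisper/whisper_factory.py from this commit.
from modules.whisper.whisper_Inference import WhisperInference
from modules.whisper.faster_whisper_inference import FasterWhisperInference
from modules.whisper.insanely_fast_whisper_inference import InsanelyFastWhisperInference


class WhisperFactory:
    @staticmethod
    def create_whisper_inference(whisper_type: str,
                                 whisper_model_dir: str,
                                 faster_whisper_model_dir: str,
                                 insanely_fast_whisper_model_dir: str,
                                 output_dir: str):
        # Normalize variants like "faster_whisper" / "faster-whisper" before matching.
        whisper_type = whisper_type.strip().lower().replace("_", "-")
        if whisper_type == "whisper":
            return WhisperInference(model_dir=whisper_model_dir, output_dir=output_dir)
        if whisper_type == "insanely-fast-whisper":
            return InsanelyFastWhisperInference(model_dir=insanely_fast_whisper_model_dir,
                                                output_dir=output_dir)
        # Fall back to faster-whisper, matching the parser's default --whisper_type.
        return FasterWhisperInference(model_dir=faster_whisper_model_dir, output_dir=output_dir)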
docker-compose.yaml ADDED
@@ -0,0 +1,31 @@
+version: '3.8'
+
+services:
+  app:
+    build: .
+    image: jhj0517/whisper-webui:latest
+
+    volumes:
+      # Update paths to mount models and output paths to your custom paths like this, e.g:
+      # - C:/whisper-models/custom-path:/Whisper-WebUI/models
+      # - C:/whisper-webui-outputs/custom-path:/Whisper-WebUI/outputs
+      - /Whisper-WebUI/models
+      - /Whisper-WebUI/outputs
+
+    ports:
+      - "7860:7860"
+
+    stdin_open: true
+    tty: true
+
+    entrypoint: ["python", "app.py", "--server_port", "7860", "--server_name", "0.0.0.0",]
+
+    # If you're not using nvidia GPU, Update device to match yours.
+    # See more info at : https://docs.docker.com/compose/compose-file/deploy/#driver
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: all
+              capabilities: [ gpu ]
modules/diarize/audio_loader.py CHANGED
@@ -1,7 +1,11 @@
 import os
 import subprocess
 from functools import lru_cache
 from typing import Optional, Union

 import numpy as np
 import torch
@@ -24,32 +28,43 @@ FRAMES_PER_SECOND = exact_div(SAMPLE_RATE, HOP_LENGTH) # 10ms per audio frame
 TOKENS_PER_SECOND = exact_div(SAMPLE_RATE, N_SAMPLES_PER_TOKEN)  # 20ms per audio token


-def load_audio(file: str, sr: int = SAMPLE_RATE):
    """
-    Open an audio file and read as mono waveform, resampling as necessary

    Parameters
    ----------
-    file: str
-        The audio file to open

    sr: int
-        The sample rate to resample the audio if necessary

    Returns
    -------
    A NumPy array containing the audio waveform, in float32 dtype.
    """
    try:
-        # Launches a subprocess to decode audio while down-mixing and resampling as necessary.
-        # Requires the ffmpeg CLI to be installed.
        cmd = [
            "ffmpeg",
            "-nostdin",
            "-threads",
            "0",
            "-i",
-            file,
            "-f",
            "s16le",
            "-ac",
@@ -63,6 +78,9 @@ def load_audio(file: str, sr: int = SAMPLE_RATE):
        out = subprocess.run(cmd, capture_output=True, check=True).stdout
    except subprocess.CalledProcessError as e:
        raise RuntimeError(f"Failed to load audio: {e.stderr.decode()}") from e

    return np.frombuffer(out, np.int16).flatten().astype(np.float32) / 32768.0
 
+# Adapted from https://github.com/m-bain/whisperX/blob/main/whisperx/audio.py
+
 import os
 import subprocess
 from functools import lru_cache
 from typing import Optional, Union
+from scipy.io.wavfile import write
+import tempfile

 import numpy as np
 import torch

 TOKENS_PER_SECOND = exact_div(SAMPLE_RATE, N_SAMPLES_PER_TOKEN)  # 20ms per audio token


+def load_audio(file: Union[str, np.ndarray], sr: int = SAMPLE_RATE) -> np.ndarray:
    """
+    Open an audio file or process a numpy array containing audio data as mono waveform, resampling as necessary.

    Parameters
    ----------
+    file: Union[str, np.ndarray]
+        The audio file to open or a numpy array containing the audio data.

    sr: int
+        The sample rate to resample the audio if necessary.

    Returns
    -------
    A NumPy array containing the audio waveform, in float32 dtype.
    """
+    if isinstance(file, np.ndarray):
+        if file.dtype != np.float32:
+            file = file.astype(np.float32)
+        if file.ndim > 1:
+            file = np.mean(file, axis=1)
+
+        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
+        write(temp_file.name, SAMPLE_RATE, (file * 32768).astype(np.int16))
+        temp_file_path = temp_file.name
+        temp_file.close()
+    else:
+        temp_file_path = file
+
    try:
        cmd = [
            "ffmpeg",
            "-nostdin",
            "-threads",
            "0",
            "-i",
+            temp_file_path,
            "-f",
            "s16le",
            "-ac",

        out = subprocess.run(cmd, capture_output=True, check=True).stdout
    except subprocess.CalledProcessError as e:
        raise RuntimeError(f"Failed to load audio: {e.stderr.decode()}") from e
+    finally:
+        if isinstance(file, np.ndarray):
+            os.remove(temp_file_path)

    return np.frombuffer(out, np.int16).flatten().astype(np.float32) / 32768.0
 
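Since load_audio() now accepts an in-memory numpy array as well as a path, a short usage sketch may help (hypothetical snippet, not part of this commit; it assumes the array is already mono float32 at the module's SAMPLE_RATE):

# Hypothetical usage sketch for the updated load_audio(); not part of this commit.
import numpy as np
from modules.diarize.audio_loader import load_audio

# From a file path, as before (decoded and resampled through ffmpeg):
waveform = load_audio("sample.wav")  # "sample.wav" is a placeholder path

# From a numpy array already held in memory: the array is written to a
# temporary .wav file, decoded through ffmpeg, then the temp file is removed.
one_second_of_silence = np.zeros(16000, dtype=np.float32)
waveform = load_audio(one_second_of_silence)
print(waveform.dtype, waveform.shape)  # float32 mono samples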
modules/diarize/diarize_pipeline.py CHANGED
@@ -1,3 +1,5 @@
 import numpy as np
 import pandas as pd
 import os

+# Adapted from https://github.com/m-bain/whisperX/blob/main/whisperx/diarize.py
+
 import numpy as np
 import pandas as pd
 import os
modules/diarize/diarizer.py CHANGED
@@ -1,9 +1,9 @@
 import os
 import torch
-from typing import List
 import time
 import logging
-import spaces

 from modules.diarize.diarize_pipeline import DiarizationPipeline, assign_word_speakers
 from modules.diarize.audio_loader import load_audio
@@ -20,9 +20,8 @@ class Diarizer:
        os.makedirs(self.model_dir, exist_ok=True)
        self.pipe = None

-    @spaces.GPU
    def run(self,
-            audio: str,
            transcribed_result: List[dict],
            use_auth_token: str,
            device: str
@@ -75,7 +74,6 @@ class Diarizer:
        elapsed_time = time.time() - start_time
        return diarized_result["segments"], elapsed_time

-    @spaces.GPU
    def update_pipe(self,
                    use_auth_token: str,
                    device: str
@@ -113,7 +111,6 @@ class Diarizer:
        logger.disabled = False

    @staticmethod
-    @spaces.GPU
    def get_device():
        if torch.cuda.is_available():
            return "cuda"
@@ -123,7 +120,6 @@ class Diarizer:
            return "cpu"

    @staticmethod
-    @spaces.GPU
    def get_available_device():
        devices = ["cpu"]
        if torch.cuda.is_available():
 
 import os
 import torch
+from typing import List, Union, BinaryIO
+import numpy as np
 import time
 import logging

 from modules.diarize.diarize_pipeline import DiarizationPipeline, assign_word_speakers
 from modules.diarize.audio_loader import load_audio

        os.makedirs(self.model_dir, exist_ok=True)
        self.pipe = None

    def run(self,
+            audio: Union[str, BinaryIO, np.ndarray],
            transcribed_result: List[dict],
            use_auth_token: str,
            device: str

        elapsed_time = time.time() - start_time
        return diarized_result["segments"], elapsed_time

    def update_pipe(self,
                    use_auth_token: str,
                    device: str

        logger.disabled = False

    @staticmethod
    def get_device():
        if torch.cuda.is_available():
            return "cuda"

            return "cpu"

    @staticmethod
    def get_available_device():
        devices = ["cpu"]
        if torch.cuda.is_available():
modules/translation/deepl_api.py CHANGED
@@ -83,7 +83,7 @@ DEEPL_AVAILABLE_SOURCE_LANGS = {

 class DeepLAPI:
    def __init__(self,
-                 output_dir: str
                 ):
        self.api_interval = 1
        self.max_text_batch_size = 50
@@ -97,6 +97,7 @@ class DeepLAPI:
                         source_lang: str,
                         target_lang: str,
                         is_pro: bool,
                         progress=gr.Progress()) -> list:
        """
        Translate subtitle files using DeepL API
@@ -112,6 +113,8 @@ class DeepLAPI:
            Target language of the file to transcribe from gr.Dropdown()
        is_pro: str
            Boolean value that is about pro user or not from gr.Checkbox().
        progress: gr.Progress
            Indicator to show progress directly in gradio.

@@ -141,11 +144,6 @@ class DeepLAPI:
                    progress(batch_end / len(parsed_dicts), desc="Translating..")

                subtitle = get_serialized_srt(parsed_dicts)
-                timestamp = datetime.now().strftime("%m%d%H%M%S")
-
-                file_name = file_name[:-9]
-                output_path = os.path.join(self.output_dir, "", f"{file_name}-{timestamp}.srt")
-                write_file(subtitle, output_path)

            elif file_ext == ".vtt":
                parsed_dicts = parse_vtt(file_path=file_path)
@@ -161,22 +159,25 @@ class DeepLAPI:
                    progress(batch_end / len(parsed_dicts), desc="Translating..")

                subtitle = get_serialized_vtt(parsed_dicts)
                timestamp = datetime.now().strftime("%m%d%H%M%S")

-                file_name = file_name[:-9]
-                output_path = os.path.join(self.output_dir, "", f"{file_name}-{timestamp}.vtt")

-                write_file(subtitle, output_path)

-            files_info[file_name] = subtitle
        total_result = ''
-        for file_name, subtitle in files_info.items():
            total_result += '------------------------------------\n'
            total_result += f'{file_name}\n\n'
-            total_result += f'{subtitle}'
-
        gr_str = f"Done! Subtitle is in the outputs/translation folder.\n\n{total_result}"
-        return [gr_str, output_path]

    def request_deepl_translate(self,
                                auth_key: str,
 

 class DeepLAPI:
    def __init__(self,
+                 output_dir: str = os.path.join("outputs", "translations")
                 ):
        self.api_interval = 1
        self.max_text_batch_size = 50

                         source_lang: str,
                         target_lang: str,
                         is_pro: bool,
+                         add_timestamp: bool,
                         progress=gr.Progress()) -> list:
        """
        Translate subtitle files using DeepL API

            Target language of the file to transcribe from gr.Dropdown()
        is_pro: str
            Boolean value that is about pro user or not from gr.Checkbox().
+        add_timestamp: bool
+            Boolean value from gr.Checkbox() that determines whether to add a timestamp at the end of the filename.
        progress: gr.Progress
            Indicator to show progress directly in gradio.


                    progress(batch_end / len(parsed_dicts), desc="Translating..")

                subtitle = get_serialized_srt(parsed_dicts)

            elif file_ext == ".vtt":
                parsed_dicts = parse_vtt(file_path=file_path)

                    progress(batch_end / len(parsed_dicts), desc="Translating..")

                subtitle = get_serialized_vtt(parsed_dicts)
+
+            if add_timestamp:
                timestamp = datetime.now().strftime("%m%d%H%M%S")
+                file_name += f"-{timestamp}"

+            output_path = os.path.join(self.output_dir, f"{file_name}{file_ext}")
+            write_file(subtitle, output_path)

+            files_info[file_name] = {"subtitle": subtitle, "path": output_path}

        total_result = ''
+        for file_name, info in files_info.items():
            total_result += '------------------------------------\n'
            total_result += f'{file_name}\n\n'
+            total_result += f'{info["subtitle"]}'

        gr_str = f"Done! Subtitle is in the outputs/translation folder.\n\n{total_result}"
+
+        output_file_paths = [item["path"] for key, item in files_info.items()]
+        return [gr_str, output_file_paths]

    def request_deepl_translate(self,
                                auth_key: str,
modules/translation/nllb_inference.py CHANGED
@@ -1,15 +1,14 @@
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
 import gradio as gr
 import os
-import spaces

 from modules.translation.translation_base import TranslationBase


 class NLLBInference(TranslationBase):
    def __init__(self,
-                 model_dir: str,
-                 output_dir: str
                 ):
        super().__init__(
            model_dir=model_dir,
@@ -21,14 +20,16 @@ class NLLBInference(TranslationBase):
        self.available_target_langs = list(NLLB_AVAILABLE_LANGS.keys())
        self.pipeline = None

-    @spaces.GPU(duration=120)
    def translate(self,
-                  text: str
                  ):
-        result = self.pipeline(text)
        return result[0]['translation_text']

-    @spaces.GPU(duration=120)
    def update_model(self,
                     model_size: str,
                     src_lang: str,
@@ -39,10 +40,13 @@ class NLLBInference(TranslationBase):
        print("\nInitializing NLLB Model..\n")
        progress(0, desc="Initializing NLLB Model..")
        self.current_model_size = model_size
        self.model = AutoModelForSeq2SeqLM.from_pretrained(pretrained_model_name_or_path=model_size,
-                                                           cache_dir=self.model_dir)
        self.tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_size,
-                                                       cache_dir=os.path.join(self.model_dir, "tokenizers"))
        src_lang = NLLB_AVAILABLE_LANGS[src_lang]
        tgt_lang = NLLB_AVAILABLE_LANGS[tgt_lang]
        self.pipeline = pipeline("translation",
@@ -52,6 +56,18 @@ class NLLBInference(TranslationBase):
                                 tgt_lang=tgt_lang,
                                 device=self.device)

 NLLB_AVAILABLE_LANGS = {
    "Acehnese (Arabic script)": "ace_Arab",
    "Acehnese (Latin script)": "ace_Latn",
  "Acehnese (Latin script)": "ace_Latn",
 
1
  from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
2
  import gradio as gr
3
  import os
 
4
 
5
  from modules.translation.translation_base import TranslationBase
6
 
7
 
8
  class NLLBInference(TranslationBase):
9
  def __init__(self,
10
+ model_dir: str = os.path.join("models", "NLLB"),
11
+ output_dir: str = os.path.join("outputs", "translations")
12
  ):
13
  super().__init__(
14
  model_dir=model_dir,
 
20
  self.available_target_langs = list(NLLB_AVAILABLE_LANGS.keys())
21
  self.pipeline = None
22
 
 
23
  def translate(self,
24
+ text: str,
25
+ max_length: int
26
  ):
27
+ result = self.pipeline(
28
+ text,
29
+ max_length=max_length
30
+ )
31
  return result[0]['translation_text']
32
 
 
33
  def update_model(self,
34
  model_size: str,
35
  src_lang: str,
 
40
  print("\nInitializing NLLB Model..\n")
41
  progress(0, desc="Initializing NLLB Model..")
42
  self.current_model_size = model_size
43
+ local_files_only = self.is_model_exists(self.current_model_size)
44
  self.model = AutoModelForSeq2SeqLM.from_pretrained(pretrained_model_name_or_path=model_size,
45
+ cache_dir=self.model_dir,
46
+ local_files_only=local_files_only)
47
  self.tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_size,
48
+ cache_dir=os.path.join(self.model_dir, "tokenizers"),
49
+ local_files_only=local_files_only)
50
  src_lang = NLLB_AVAILABLE_LANGS[src_lang]
51
  tgt_lang = NLLB_AVAILABLE_LANGS[tgt_lang]
52
  self.pipeline = pipeline("translation",
 
56
  tgt_lang=tgt_lang,
57
  device=self.device)
58
 
59
+ def is_model_exists(self,
60
+ model_size: str):
61
+ """Check if model exists or not (Only facebook model)"""
62
+ prefix = "models--facebook--"
63
+ _id, model_size_name = model_size.split("/")
64
+ model_dir_name = prefix + model_size_name
65
+ model_dir_path = os.path.join(self.model_dir, model_dir_name)
66
+ if os.path.exists(model_dir_path) and os.listdir(model_dir_path):
67
+ return True
68
+ return False
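The `is_model_exists()` check above relies on the Hugging Face cache layout, where a repo ID such as `facebook/nllb-200-distilled-600M` is cached under a `models--facebook--<name>` folder inside `model_dir`. A minimal sketch of that path derivation, assuming the default `models/NLLB` directory and that example model ID:

```python
import os

# Assumed inputs for illustration; any facebook/* NLLB repo ID follows the same pattern.
model_size = "facebook/nllb-200-distilled-600M"
cache_dir = os.path.join("models", "NLLB")

_owner, model_name = model_size.split("/")
model_dir_path = os.path.join(cache_dir, f"models--facebook--{model_name}")

# Mirrors the check above: the cache folder must exist and be non-empty.
already_downloaded = os.path.exists(model_dir_path) and bool(os.listdir(model_dir_path))
print(already_downloaded)
```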
69
+
70
+
71
  NLLB_AVAILABLE_LANGS = {
72
  "Acehnese (Arabic script)": "ace_Arab",
73
  "Acehnese (Latin script)": "ace_Latn",
modules/translation/translation_base.py CHANGED
@@ -4,7 +4,6 @@ import gradio as gr
4
  from abc import ABC, abstractmethod
5
  from typing import List
6
  from datetime import datetime
7
- import spaces
8
 
9
  from modules.whisper.whisper_parameter import *
10
  from modules.utils.subtitle_manager import *
@@ -12,8 +11,9 @@ from modules.utils.subtitle_manager import *
12
 
13
  class TranslationBase(ABC):
14
  def __init__(self,
15
- model_dir: str,
16
- output_dir: str):
 
17
  super().__init__()
18
  self.model = None
19
  self.model_dir = model_dir
@@ -24,14 +24,13 @@ class TranslationBase(ABC):
24
  self.device = self.get_device()
25
 
26
  @abstractmethod
27
- @spaces.GPU(duration=120)
28
  def translate(self,
29
- text: str
 
30
  ):
31
  pass
32
 
33
  @abstractmethod
34
- @spaces.GPU(duration=120)
35
  def update_model(self,
36
  model_size: str,
37
  src_lang: str,
@@ -40,12 +39,12 @@ class TranslationBase(ABC):
40
  ):
41
  pass
42
 
43
- @spaces.GPU(duration=120)
44
  def translate_file(self,
45
  fileobjs: list,
46
  model_size: str,
47
  src_lang: str,
48
  tgt_lang: str,
 
49
  add_timestamp: bool,
50
  progress=gr.Progress()) -> list:
51
  """
@@ -61,6 +60,8 @@ class TranslationBase(ABC):
61
  Source language of the file to translate from gr.Dropdown()
62
  tgt_lang: str
63
  Target language of the file to translate from gr.Dropdown()
 
 
64
  add_timestamp: bool
65
  Boolean value from gr.Checkbox() that determines whether to add a timestamp at the end of the filename.
66
  progress: gr.Progress
@@ -88,50 +89,44 @@ class TranslationBase(ABC):
88
  total_progress = len(parsed_dicts)
89
  for index, dic in enumerate(parsed_dicts):
90
  progress(index / total_progress, desc="Translating..")
91
- translated_text = self.translate(dic["sentence"])
92
  dic["sentence"] = translated_text
93
  subtitle = get_serialized_srt(parsed_dicts)
94
 
95
- timestamp = datetime.now().strftime("%m%d%H%M%S")
96
- if add_timestamp:
97
- output_path = os.path.join("outputs", "", f"{file_name}-{timestamp}.srt")
98
- else:
99
- output_path = os.path.join("outputs", "", f"{file_name}.srt")
100
-
101
  elif file_ext == ".vtt":
102
  parsed_dicts = parse_vtt(file_path=file_path)
103
  total_progress = len(parsed_dicts)
104
  for index, dic in enumerate(parsed_dicts):
105
  progress(index / total_progress, desc="Translating..")
106
- translated_text = self.translate(dic["sentence"])
107
  dic["sentence"] = translated_text
108
  subtitle = get_serialized_vtt(parsed_dicts)
109
 
 
110
  timestamp = datetime.now().strftime("%m%d%H%M%S")
111
- if add_timestamp:
112
- output_path = os.path.join(self.output_dir, "", f"{file_name}-{timestamp}.vtt")
113
- else:
114
- output_path = os.path.join(self.output_dir, "", f"{file_name}.vtt")
115
 
 
116
  write_file(subtitle, output_path)
117
- files_info[file_name] = subtitle
 
118
 
119
  total_result = ''
120
- for file_name, subtitle in files_info.items():
121
  total_result += '------------------------------------\n'
122
  total_result += f'{file_name}\n\n'
123
- total_result += f'{subtitle}'
124
-
125
  gr_str = f"Done! Subtitle is in the outputs/translation folder.\n\n{total_result}"
126
- return [gr_str, output_path]
 
 
 
127
  except Exception as e:
128
  print(f"Error: {str(e)}")
129
  finally:
130
  self.release_cuda_memory()
131
- self.remove_input_files([fileobj.name for fileobj in fileobjs])
132
 
133
  @staticmethod
134
- @spaces.GPU(duration=120)
135
  def get_device():
136
  if torch.cuda.is_available():
137
  return "cuda"
@@ -141,7 +136,6 @@ class TranslationBase(ABC):
141
  return "cpu"
142
 
143
  @staticmethod
144
- @spaces.GPU(duration=120)
145
  def release_cuda_memory():
146
  if torch.cuda.is_available():
147
  torch.cuda.empty_cache()
 
4
  from abc import ABC, abstractmethod
5
  from typing import List
6
  from datetime import datetime
 
7
 
8
  from modules.whisper.whisper_parameter import *
9
  from modules.utils.subtitle_manager import *
 
11
 
12
  class TranslationBase(ABC):
13
  def __init__(self,
14
+ model_dir: str = os.path.join("models", "NLLB"),
15
+ output_dir: str = os.path.join("outputs", "translations")
16
+ ):
17
  super().__init__()
18
  self.model = None
19
  self.model_dir = model_dir
 
24
  self.device = self.get_device()
25
 
26
  @abstractmethod
 
27
  def translate(self,
28
+ text: str,
29
+ max_length: int
30
  ):
31
  pass
32
 
33
  @abstractmethod
 
34
  def update_model(self,
35
  model_size: str,
36
  src_lang: str,
 
39
  ):
40
  pass
41
 
 
42
  def translate_file(self,
43
  fileobjs: list,
44
  model_size: str,
45
  src_lang: str,
46
  tgt_lang: str,
47
+ max_length: int,
48
  add_timestamp: bool,
49
  progress=gr.Progress()) -> list:
50
  """
 
60
  Source language of the file to translate from gr.Dropdown()
61
  tgt_lang: str
62
  Target language of the file to translate from gr.Dropdown()
63
+ max_length: int
64
+ Max length per line to translate
65
  add_timestamp: bool
66
  Boolean value from gr.Checkbox() that determines whether to add a timestamp at the end of the filename.
67
  progress: gr.Progress
 
89
  total_progress = len(parsed_dicts)
90
  for index, dic in enumerate(parsed_dicts):
91
  progress(index / total_progress, desc="Translating..")
92
+ translated_text = self.translate(dic["sentence"], max_length=max_length)
93
  dic["sentence"] = translated_text
94
  subtitle = get_serialized_srt(parsed_dicts)
95
 
 
 
 
 
 
 
96
  elif file_ext == ".vtt":
97
  parsed_dicts = parse_vtt(file_path=file_path)
98
  total_progress = len(parsed_dicts)
99
  for index, dic in enumerate(parsed_dicts):
100
  progress(index / total_progress, desc="Translating..")
101
+ translated_text = self.translate(dic["sentence"], max_length=max_length)
102
  dic["sentence"] = translated_text
103
  subtitle = get_serialized_vtt(parsed_dicts)
104
 
105
+ if add_timestamp:
106
  timestamp = datetime.now().strftime("%m%d%H%M%S")
107
+ file_name += f"-{timestamp}"
 
 
 
108
 
109
+ output_path = os.path.join(self.output_dir, f"{file_name}{file_ext}")
110
  write_file(subtitle, output_path)
111
+
112
+ files_info[file_name] = {"subtitle": subtitle, "path": output_path}
113
 
114
  total_result = ''
115
+ for file_name, info in files_info.items():
116
  total_result += '------------------------------------\n'
117
  total_result += f'{file_name}\n\n'
118
+ total_result += f'{info["subtitle"]}'
 
119
  gr_str = f"Done! Subtitle is in the outputs/translation folder.\n\n{total_result}"
120
+
121
+ output_file_paths = [item["path"] for key, item in files_info.items()]
122
+ return [gr_str, output_file_paths]
123
+
124
  except Exception as e:
125
  print(f"Error: {str(e)}")
126
  finally:
127
  self.release_cuda_memory()
 
128
 
129
  @staticmethod
 
130
  def get_device():
131
  if torch.cuda.is_available():
132
  return "cuda"
 
136
  return "cpu"
137
 
138
  @staticmethod
 
139
  def release_cuda_memory():
140
  if torch.cuda.is_available():
141
  torch.cuda.empty_cache()
modules/utils/files_manager.py ADDED
@@ -0,0 +1,39 @@
+ import os
+ import fnmatch
+
+ from gradio.utils import NamedString
+
+
+ def get_media_files(folder_path, include_sub_directory=False):
+ video_extensions = ['*.mp4', '*.mkv', '*.flv', '*.avi', '*.mov', '*.wmv']
+ audio_extensions = ['*.mp3', '*.wav', '*.aac', '*.flac', '*.ogg', '*.m4a']
+ media_extensions = video_extensions + audio_extensions
+
+ media_files = []
+
+ if include_sub_directory:
+ for root, _, files in os.walk(folder_path):
+ for extension in media_extensions:
+ media_files.extend(
+ os.path.join(root, file) for file in fnmatch.filter(files, extension)
+ if os.path.exists(os.path.join(root, file))
+ )
+ else:
+ for extension in media_extensions:
+ media_files.extend(
+ os.path.join(folder_path, file) for file in fnmatch.filter(os.listdir(folder_path), extension)
+ if os.path.isfile(os.path.join(folder_path, file)) and os.path.exists(os.path.join(folder_path, file))
+ )
+
+ return media_files
+
+
+ def format_gradio_files(files: list):
+ if not files:
+ return files
+
+ gradio_files = []
+ for file in files:
+ gradio_files.append(NamedString(file))
+ return gradio_files
+
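A short usage sketch for the helpers above, assuming a hypothetical `media/` folder next to the app; `format_gradio_files()` wraps the plain paths in gradio's `NamedString` so they can stand in for the objects returned by `gr.Files()`:

```python
from modules.utils.files_manager import get_media_files, format_gradio_files

# Hypothetical folder; only the video/audio extensions listed above are collected.
paths = get_media_files("media", include_sub_directory=True)
files = format_gradio_files(paths)

for f in files:
    print(f)  # each entry is a NamedString wrapping one media file path
```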
modules/utils/subtitle_manager.py CHANGED
@@ -1,7 +1,5 @@
  import re

- # Zero GPU
- import spaces

  def timeformat_srt(time):
  hours = time // 3600
@@ -119,7 +117,7 @@ def get_serialized_vtt(dicts):
  output += f'{dic["sentence"]}\n\n'
  return output

- @spaces.GPU(duration=120)
+
  def safe_filename(name):
  from app import _args
  INVALID_FILENAME_CHARS = r'[<>:"/\\|?*\x00-\x1f]'
modules/utils/youtube_manager.py CHANGED
@@ -1,4 +1,4 @@
- from pytube import YouTube
+ from pytubefix import YouTube
  import os

modules/vad/silero_vad.py CHANGED
@@ -1,21 +1,25 @@
1
- from faster_whisper.vad import VadOptions
 
 
2
  import numpy as np
3
- from typing import BinaryIO, Union, List, Optional
4
  import warnings
5
  import faster_whisper
 
6
  import gradio as gr
7
- import spaces
8
 
9
 
10
  class SileroVAD:
11
  def __init__(self):
12
  self.sampling_rate = 16000
 
 
13
 
14
- @spaces.GPU
15
  def run(self,
16
  audio: Union[str, BinaryIO, np.ndarray],
17
  vad_parameters: VadOptions,
18
- progress: gr.Progress = gr.Progress()):
 
19
  """
20
  Run VAD
21
 
@@ -30,8 +34,10 @@ class SileroVAD:
30
 
31
  Returns
32
  ----------
33
- audio: np.ndarray
34
  Pre-processed audio with VAD
 
 
35
  """
36
 
37
  sampling_rate = self.sampling_rate
@@ -54,11 +60,10 @@ class SileroVAD:
54
  audio = self.collect_chunks(audio, speech_chunks)
55
  duration_after_vad = audio.shape[0] / sampling_rate
56
 
57
- return audio
58
 
59
- @staticmethod
60
- @spaces.GPU
61
  def get_speech_timestamps(
 
62
  audio: np.ndarray,
63
  vad_options: Optional[VadOptions] = None,
64
  progress: gr.Progress = gr.Progress(),
@@ -75,6 +80,10 @@ class SileroVAD:
75
  Returns:
76
  List of dicts containing begin and end samples of each speech chunk.
77
  """
 
 
 
 
78
  if vad_options is None:
79
  vad_options = VadOptions(**kwargs)
80
 
@@ -82,15 +91,8 @@ class SileroVAD:
82
  min_speech_duration_ms = vad_options.min_speech_duration_ms
83
  max_speech_duration_s = vad_options.max_speech_duration_s
84
  min_silence_duration_ms = vad_options.min_silence_duration_ms
85
- window_size_samples = vad_options.window_size_samples
86
  speech_pad_ms = vad_options.speech_pad_ms
87
-
88
- if window_size_samples not in [512, 1024, 1536]:
89
- warnings.warn(
90
- "Unusual window_size_samples! Supported window_size_samples:\n"
91
- " - [512, 1024, 1536] for 16000 sampling_rate"
92
- )
93
-
94
  sampling_rate = 16000
95
  min_speech_samples = sampling_rate * min_speech_duration_ms / 1000
96
  speech_pad_samples = sampling_rate * speech_pad_ms / 1000
@@ -104,8 +106,7 @@ class SileroVAD:
104
 
105
  audio_length_samples = len(audio)
106
 
107
- model = faster_whisper.vad.get_vad_model()
108
- state = model.get_initial_state(batch_size=1)
109
 
110
  speech_probs = []
111
  for current_start_sample in range(0, audio_length_samples, window_size_samples):
@@ -114,7 +115,7 @@ class SileroVAD:
114
  chunk = audio[current_start_sample: current_start_sample + window_size_samples]
115
  if len(chunk) < window_size_samples:
116
  chunk = np.pad(chunk, (0, int(window_size_samples - len(chunk))))
117
- speech_prob, state = model(chunk, state, sampling_rate)
118
  speech_probs.append(speech_prob)
119
 
120
  triggered = False
@@ -210,6 +211,9 @@ class SileroVAD:
210
 
211
  return speeches
212
 
 
 
 
213
  @staticmethod
214
  def collect_chunks(audio: np.ndarray, chunks: List[dict]) -> np.ndarray:
215
  """Collects and concatenates audio chunks."""
@@ -241,3 +245,20 @@ class SileroVAD:
241
  f"{hours_marker}{minutes:02d}:{seconds:02d}{decimal_marker}{milliseconds:03d}"
242
  )
1
+ # Adapted from https://github.com/SYSTRAN/faster-whisper/blob/master/faster_whisper/vad.py
2
+
3
+ from faster_whisper.vad import VadOptions, get_vad_model
4
  import numpy as np
5
+ from typing import BinaryIO, Union, List, Optional, Tuple
6
  import warnings
7
  import faster_whisper
8
+ from faster_whisper.transcribe import SpeechTimestampsMap, Segment
9
  import gradio as gr
 
10
 
11
 
12
  class SileroVAD:
13
  def __init__(self):
14
  self.sampling_rate = 16000
15
+ self.window_size_samples = 512
16
+ self.model = None
17
 
 
18
  def run(self,
19
  audio: Union[str, BinaryIO, np.ndarray],
20
  vad_parameters: VadOptions,
21
+ progress: gr.Progress = gr.Progress()
22
+ ) -> Tuple[np.ndarray, List[dict]]:
23
  """
24
  Run VAD
25
 
 
34
 
35
  Returns
36
  ----------
37
+ np.ndarray
38
  Pre-processed audio with VAD
39
+ List[dict]
40
+ Chunks of speeches to be used to restore the timestamps later
41
  """
42
 
43
  sampling_rate = self.sampling_rate
 
60
  audio = self.collect_chunks(audio, speech_chunks)
61
  duration_after_vad = audio.shape[0] / sampling_rate
62
 
63
+ return audio, speech_chunks
64
 
 
 
65
  def get_speech_timestamps(
66
+ self,
67
  audio: np.ndarray,
68
  vad_options: Optional[VadOptions] = None,
69
  progress: gr.Progress = gr.Progress(),
 
80
  Returns:
81
  List of dicts containing begin and end samples of each speech chunk.
82
  """
83
+
84
+ if self.model is None:
85
+ self.update_model()
86
+
87
  if vad_options is None:
88
  vad_options = VadOptions(**kwargs)
89
 
 
91
  min_speech_duration_ms = vad_options.min_speech_duration_ms
92
  max_speech_duration_s = vad_options.max_speech_duration_s
93
  min_silence_duration_ms = vad_options.min_silence_duration_ms
94
+ window_size_samples = self.window_size_samples
95
  speech_pad_ms = vad_options.speech_pad_ms
 
 
 
 
 
 
 
96
  sampling_rate = 16000
97
  min_speech_samples = sampling_rate * min_speech_duration_ms / 1000
98
  speech_pad_samples = sampling_rate * speech_pad_ms / 1000
 
106
 
107
  audio_length_samples = len(audio)
108
 
109
+ state, context = self.model.get_initial_states(batch_size=1)
 
110
 
111
  speech_probs = []
112
  for current_start_sample in range(0, audio_length_samples, window_size_samples):
 
115
  chunk = audio[current_start_sample: current_start_sample + window_size_samples]
116
  if len(chunk) < window_size_samples:
117
  chunk = np.pad(chunk, (0, int(window_size_samples - len(chunk))))
118
+ speech_prob, state, context = self.model(chunk, state, context, sampling_rate)
119
  speech_probs.append(speech_prob)
120
 
121
  triggered = False
 
211
 
212
  return speeches
213
 
214
+ def update_model(self):
215
+ self.model = get_vad_model()
216
+
217
  @staticmethod
218
  def collect_chunks(audio: np.ndarray, chunks: List[dict]) -> np.ndarray:
219
  """Collects and concatenates audio chunks."""
 
245
  f"{hours_marker}{minutes:02d}:{seconds:02d}{decimal_marker}{milliseconds:03d}"
246
  )
247
 
248
+ def restore_speech_timestamps(
249
+ self,
250
+ segments: List[dict],
251
+ speech_chunks: List[dict],
252
+ sampling_rate: Optional[int] = None,
253
+ ) -> List[dict]:
254
+ if sampling_rate is None:
255
+ sampling_rate = self.sampling_rate
256
+
257
+ ts_map = SpeechTimestampsMap(speech_chunks, sampling_rate)
258
+
259
+ for segment in segments:
260
+ segment["start"] = ts_map.get_original_time(segment["start"])
261
+ segment["end"] = ts_map.get_original_time(segment["end"])
262
+
263
+ return segments
264
+
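A rough sketch of how the two-stage flow is meant to be used (mirroring the calls added to `whisper_base.run()` further down): trim the audio with VAD, transcribe the trimmed audio, then map the segment times back onto the original timeline. The file name and the placeholder segment are assumptions for illustration only:

```python
from faster_whisper.vad import VadOptions
from modules.vad.silero_vad import SileroVAD

vad = SileroVAD()

# 1. Remove non-speech; speech_chunks remembers where the kept audio came from.
trimmed_audio, speech_chunks = vad.run(audio="sample.wav", vad_parameters=VadOptions())

# 2. Transcribe trimmed_audio with any Whisper implementation (placeholder result here).
segments = [{"start": 0.0, "end": 2.5, "text": "hello"}]

# 3. Shift the timestamps back to the untrimmed timeline for subtitle output.
segments = vad.restore_speech_timestamps(segments=segments, speech_chunks=speech_chunks)
print(segments)
```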
modules/whisper/faster_whisper_inference.py CHANGED
@@ -5,6 +5,7 @@ import torch
5
  from typing import BinaryIO, Union, Tuple, List
6
  import faster_whisper
7
  from faster_whisper.vad import VadOptions
 
8
  import ctranslate2
9
  import whisper
10
  import gradio as gr
@@ -13,31 +14,31 @@ from argparse import Namespace
13
  from modules.whisper.whisper_parameter import *
14
  from modules.whisper.whisper_base import WhisperBase
15
 
16
- import spaces
17
 
18
  class FasterWhisperInference(WhisperBase):
19
  def __init__(self,
20
- model_dir: str,
21
- output_dir: str,
22
- args: Namespace
23
  ):
24
  super().__init__(
25
  model_dir=model_dir,
26
- output_dir=output_dir,
27
- args=args
28
  )
 
 
 
29
  self.model_paths = self.get_model_paths()
30
  self.device = self.get_device()
31
  self.available_models = self.model_paths.keys()
32
- self.available_compute_types = ["float32"] # spaces bug
33
- self.current_compute_type = "float32" # spaces bug
34
- self.download_model(model_size="large-v2", model_dir=self.model_dir)
35
 
36
- #@spaces.GPU(duration=120)
37
  def transcribe(self,
38
  audio: Union[str, BinaryIO, np.ndarray],
 
39
  *whisper_params,
40
- progress: gr.Progress = gr.Progress(),
41
  ) -> Tuple[List[dict], float]:
42
  """
43
  transcribe method for faster-whisper.
@@ -65,7 +66,16 @@ class FasterWhisperInference(WhisperBase):
65
  if params.model_size != self.current_model_size or self.model is None or self.current_compute_type != params.compute_type:
66
  self.update_model(params.model_size, params.compute_type, progress)
67
 
68
- print("transcribe:")
 
 
 
 
 
 
 
 
 
69
  segments, info = self.model.transcribe(
70
  audio=audio,
71
  language=params.lang,
@@ -76,7 +86,25 @@ class FasterWhisperInference(WhisperBase):
76
  best_of=params.best_of,
77
  patience=params.patience,
78
  temperature=params.temperature,
 
79
  compression_ratio_threshold=params.compression_ratio_threshold,
80
  )
81
  progress(0, desc="Loading audio..")
82
 
@@ -90,14 +118,12 @@ class FasterWhisperInference(WhisperBase):
90
  })
91
 
92
  elapsed_time = time.time() - start_time
93
- print("transcribe: finished")
94
  return segments_result, elapsed_time
95
 
96
- #@spaces.GPU(duration=120)
97
  def update_model(self,
98
  model_size: str,
99
  compute_type: str,
100
- progress: gr.Progress = gr.Progress(),
101
  ):
102
  """
103
  Update current model setting
@@ -113,7 +139,6 @@ class FasterWhisperInference(WhisperBase):
113
  Indicator to show progress directly in gradio.
114
  """
115
  progress(0, desc="Initializing Model..")
116
- print("update_model:")
117
  self.current_model_size = self.model_paths[model_size]
118
  self.current_compute_type = compute_type
119
  self.model = faster_whisper.WhisperModel(
@@ -122,7 +147,6 @@ class FasterWhisperInference(WhisperBase):
122
  download_root=self.model_dir,
123
  compute_type=self.current_compute_type
124
  )
125
- print("update_model: finished")
126
 
127
  def get_model_paths(self):
128
  """
@@ -149,22 +173,19 @@ class FasterWhisperInference(WhisperBase):
149
  model_paths[model_name] = os.path.join(webui_dir, self.model_dir, model_name)
150
  return model_paths
151
 
152
- def get_available_compute_type(self):
153
- if self.device == "cuda":
154
- return ['float32', 'int8_float16', 'float16', 'int8', 'int8_float32']
155
- return ['int16', 'float32', 'int8', 'int8_float32']
156
-
157
- def get_device(self):
158
- # Because of huggingface spaces bug, just return cpu
159
- return "cpu"
160
-
161
  @staticmethod
162
- def download_model(model_size: str, model_dir: str):
163
- print(f"\nDownloading \"{model_size}\" to \"{model_dir}\"..\n")
164
- os.makedirs(model_dir, exist_ok=True)
165
- faster_whisper.download_model(
166
- size_or_id=model_size,
167
- cache_dir=model_dir
168
- )
169
-
170
 
 
 
 
 
 
 
 
 
 
 
5
  from typing import BinaryIO, Union, Tuple, List
6
  import faster_whisper
7
  from faster_whisper.vad import VadOptions
8
+ import ast
9
  import ctranslate2
10
  import whisper
11
  import gradio as gr
 
14
  from modules.whisper.whisper_parameter import *
15
  from modules.whisper.whisper_base import WhisperBase
16
 
 
17
 
18
  class FasterWhisperInference(WhisperBase):
19
  def __init__(self,
20
+ model_dir: str = os.path.join("models", "Whisper", "faster-whisper"),
21
+ diarization_model_dir: str = os.path.join("models", "Diarization"),
22
+ output_dir: str = os.path.join("outputs"),
23
  ):
24
  super().__init__(
25
  model_dir=model_dir,
26
+ diarization_model_dir=diarization_model_dir,
27
+ output_dir=output_dir
28
  )
29
+ self.model_dir = model_dir
30
+ os.makedirs(self.model_dir, exist_ok=True)
31
+
32
  self.model_paths = self.get_model_paths()
33
  self.device = self.get_device()
34
  self.available_models = self.model_paths.keys()
35
+ self.available_compute_types = ctranslate2.get_supported_compute_types(
36
+ "cuda") if self.device == "cuda" else ctranslate2.get_supported_compute_types("cpu")
 
37
 
 
38
  def transcribe(self,
39
  audio: Union[str, BinaryIO, np.ndarray],
40
+ progress: gr.Progress,
41
  *whisper_params,
 
42
  ) -> Tuple[List[dict], float]:
43
  """
44
  transcribe method for faster-whisper.
 
66
  if params.model_size != self.current_model_size or self.model is None or self.current_compute_type != params.compute_type:
67
  self.update_model(params.model_size, params.compute_type, progress)
68
 
69
+ # None parameters with Textboxes: https://github.com/gradio-app/gradio/issues/8723
70
+ if not params.initial_prompt:
71
+ params.initial_prompt = None
72
+ if not params.prefix:
73
+ params.prefix = None
74
+ if not params.hotwords:
75
+ params.hotwords = None
76
+
77
+ params.suppress_tokens = self.format_suppress_tokens_str(params.suppress_tokens)
78
+
79
  segments, info = self.model.transcribe(
80
  audio=audio,
81
  language=params.lang,
 
86
  best_of=params.best_of,
87
  patience=params.patience,
88
  temperature=params.temperature,
89
+ initial_prompt=params.initial_prompt,
90
  compression_ratio_threshold=params.compression_ratio_threshold,
91
+ length_penalty=params.length_penalty,
92
+ repetition_penalty=params.repetition_penalty,
93
+ no_repeat_ngram_size=params.no_repeat_ngram_size,
94
+ prefix=params.prefix,
95
+ suppress_blank=params.suppress_blank,
96
+ suppress_tokens=params.suppress_tokens,
97
+ max_initial_timestamp=params.max_initial_timestamp,
98
+ word_timestamps=params.word_timestamps,
99
+ prepend_punctuations=params.prepend_punctuations,
100
+ append_punctuations=params.append_punctuations,
101
+ max_new_tokens=params.max_new_tokens,
102
+ chunk_length=params.chunk_length,
103
+ hallucination_silence_threshold=params.hallucination_silence_threshold,
104
+ hotwords=params.hotwords,
105
+ language_detection_threshold=params.language_detection_threshold,
106
+ language_detection_segments=params.language_detection_segments,
107
+ prompt_reset_on_temperature=params.prompt_reset_on_temperature,
108
  )
109
  progress(0, desc="Loading audio..")
110
 
 
118
  })
119
 
120
  elapsed_time = time.time() - start_time
 
121
  return segments_result, elapsed_time
122
 
 
123
  def update_model(self,
124
  model_size: str,
125
  compute_type: str,
126
+ progress: gr.Progress
127
  ):
128
  """
129
  Update current model setting
 
139
  Indicator to show progress directly in gradio.
140
  """
141
  progress(0, desc="Initializing Model..")
 
142
  self.current_model_size = self.model_paths[model_size]
143
  self.current_compute_type = compute_type
144
  self.model = faster_whisper.WhisperModel(
 
147
  download_root=self.model_dir,
148
  compute_type=self.current_compute_type
149
  )
 
150
 
151
  def get_model_paths(self):
152
  """
 
173
  model_paths[model_name] = os.path.join(webui_dir, self.model_dir, model_name)
174
  return model_paths
176
  @staticmethod
177
+ def get_device():
178
+ if torch.cuda.is_available():
179
+ return "cuda"
180
+ else:
181
+ return "auto"
 
 
 
182
 
183
+ @staticmethod
184
+ def format_suppress_tokens_str(suppress_tokens_str: str) -> List[int]:
185
+ try:
186
+ suppress_tokens = ast.literal_eval(suppress_tokens_str)
187
+ if not isinstance(suppress_tokens, list) or not all(isinstance(item, int) for item in suppress_tokens):
188
+ raise ValueError("Invalid Suppress Tokens. The value must be of type List[int]")
189
+ return suppress_tokens
190
+ except Exception as e:
191
+ raise ValueError("Invalid Suppress Tokens. The value must be of type List[int]")
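Because `suppress_tokens` now arrives from a `gr.Textbox()` as a string, it has to be parsed back into a list of ints before being handed to faster-whisper. A small sketch of the accepted input format; the example strings are assumptions (faster-whisper's own default is `[-1]`):

```python
import ast

# Each value is what a user might type into the Suppress Tokens textbox.
for raw in ("[-1]", "[50257, 50362]"):
    tokens = ast.literal_eval(raw)  # "[-1]" -> [-1]
    assert isinstance(tokens, list) and all(isinstance(t, int) for t in tokens)
    print(tokens)
```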
modules/whisper/insanely_fast_whisper_inference.py CHANGED
@@ -17,15 +17,18 @@ from modules.whisper.whisper_base import WhisperBase
17
 
18
  class InsanelyFastWhisperInference(WhisperBase):
19
  def __init__(self,
20
- model_dir: str,
21
- output_dir: str,
22
- args: Namespace
23
  ):
24
  super().__init__(
25
  model_dir=model_dir,
26
  output_dir=output_dir,
27
- args=args
28
  )
 
 
 
29
  openai_models = whisper.available_models()
30
  distil_models = ["distil-large-v2", "distil-large-v3", "distil-medium.en", "distil-small.en"]
31
  self.available_models = openai_models + distil_models
 
17
 
18
  class InsanelyFastWhisperInference(WhisperBase):
19
  def __init__(self,
20
+ model_dir: str = os.path.join("models", "Whisper", "insanely-fast-whisper"),
21
+ diarization_model_dir: str = os.path.join("models", "Diarization"),
22
+ output_dir: str = os.path.join("outputs"),
23
  ):
24
  super().__init__(
25
  model_dir=model_dir,
26
  output_dir=output_dir,
27
+ diarization_model_dir=diarization_model_dir
28
  )
29
+ self.model_dir = model_dir
30
+ os.makedirs(self.model_dir, exist_ok=True)
31
+
32
  openai_models = whisper.available_models()
33
  distil_models = ["distil-large-v2", "distil-large-v3", "distil-medium.en", "distil-small.en"]
34
  self.available_models = openai_models + distil_models
modules/whisper/whisper_Inference.py CHANGED
@@ -4,6 +4,7 @@ import time
4
  from typing import BinaryIO, Union, Tuple, List
5
  import numpy as np
6
  import torch
 
7
  from argparse import Namespace
8
 
9
  from modules.whisper.whisper_base import WhisperBase
@@ -12,14 +13,14 @@ from modules.whisper.whisper_parameter import *
12
 
13
  class WhisperInference(WhisperBase):
14
  def __init__(self,
15
- model_dir: str,
16
- output_dir: str,
17
- args: Namespace
18
  ):
19
  super().__init__(
20
  model_dir=model_dir,
21
  output_dir=output_dir,
22
- args=args
23
  )
24
 
25
  def transcribe(self,
 
4
  from typing import BinaryIO, Union, Tuple, List
5
  import numpy as np
6
  import torch
7
+ import os
8
  from argparse import Namespace
9
 
10
  from modules.whisper.whisper_base import WhisperBase
 
13
 
14
  class WhisperInference(WhisperBase):
15
  def __init__(self,
16
+ model_dir: str = os.path.join("models", "Whisper"),
17
+ diarization_model_dir: str = os.path.join("models", "Diarization"),
18
+ output_dir: str = os.path.join("outputs"),
19
  ):
20
  super().__init__(
21
  model_dir=model_dir,
22
  output_dir=output_dir,
23
+ diarization_model_dir=diarization_model_dir
24
  )
25
 
26
  def transcribe(self,
modules/whisper/whisper_base.py CHANGED
@@ -6,13 +6,12 @@ from abc import ABC, abstractmethod
6
  from typing import BinaryIO, Union, Tuple, List
7
  import numpy as np
8
  from datetime import datetime
9
- from argparse import Namespace
10
  from faster_whisper.vad import VadOptions
11
  from dataclasses import astuple
12
- import spaces
13
 
14
  from modules.utils.subtitle_manager import get_srt, get_vtt, get_txt, write_file, safe_filename
15
  from modules.utils.youtube_manager import get_ytdata, get_ytaudio
 
16
  from modules.whisper.whisper_parameter import *
17
  from modules.diarize.diarizer import Diarizer
18
  from modules.vad.silero_vad import SileroVAD
@@ -20,51 +19,50 @@ from modules.vad.silero_vad import SileroVAD
20
 
21
  class WhisperBase(ABC):
22
  def __init__(self,
23
- model_dir: str,
24
- output_dir: str,
25
- args: Namespace
26
  ):
27
- self.model = None
28
- self.current_model_size = None
29
  self.model_dir = model_dir
30
  self.output_dir = output_dir
31
  os.makedirs(self.output_dir, exist_ok=True)
32
  os.makedirs(self.model_dir, exist_ok=True)
 
 
 
 
 
 
 
33
  self.available_models = whisper.available_models()
34
  self.available_langs = sorted(list(whisper.tokenizer.LANGUAGES.values()))
35
  self.translatable_models = ["large", "large-v1", "large-v2", "large-v3"]
36
  self.device = self.get_device()
37
  self.available_compute_types = ["float16", "float32"]
38
  self.current_compute_type = "float16" if self.device == "cuda" else "float32"
39
- self.diarizer = Diarizer(
40
- model_dir=args.diarization_model_dir
41
- )
42
- self.vad = SileroVAD()
43
 
44
  @abstractmethod
45
- #@spaces.GPU(duration=120)
46
  def transcribe(self,
47
  audio: Union[str, BinaryIO, np.ndarray],
 
48
  *whisper_params,
49
- progress: gr.Progress = gr.Progress(),
50
  ):
 
51
  pass
52
 
53
  @abstractmethod
54
- @spaces.GPU(duration=120)
55
  def update_model(self,
56
  model_size: str,
57
  compute_type: str,
58
- progress: gr.Progress = gr.Progress(),
59
  ):
 
60
  pass
61
 
62
- # spaces is problematic
63
- #@spaces.GPU(duration=120)
64
  def run(self,
65
  audio: Union[str, BinaryIO, np.ndarray],
 
66
  *whisper_params,
67
- progress: gr.Progress = gr.Progress(),
68
  ) -> Tuple[List[dict], float]:
69
  """
70
  Run transcription with conditional pre-processing and post-processing.
@@ -89,33 +87,44 @@ class WhisperBase(ABC):
89
  """
90
  params = WhisperParameters.as_value(*whisper_params)
91
 
 
 
 
 
 
 
 
92
  if params.vad_filter:
 
 
 
 
93
  vad_options = VadOptions(
94
  threshold=params.threshold,
95
  min_speech_duration_ms=params.min_speech_duration_ms,
96
  max_speech_duration_s=params.max_speech_duration_s,
97
  min_silence_duration_ms=params.min_silence_duration_ms,
98
- window_size_samples=params.window_size_samples,
99
  speech_pad_ms=params.speech_pad_ms
100
  )
101
- self.vad.run(
 
102
  audio=audio,
103
  vad_parameters=vad_options,
104
  progress=progress
105
  )
106
 
107
- if params.lang == "Automatic Detection":
108
- params.lang = None
109
- else:
110
- language_code_dict = {value: key for key, value in whisper.tokenizer.LANGUAGES.items()}
111
- params.lang = language_code_dict[params.lang]
112
-
113
  result, elapsed_time = self.transcribe(
114
  audio,
115
- *astuple(params),
116
- progress=progress
117
  )
118
 
 
 
 
 
 
 
119
  if params.is_diarize:
120
  result, elapsed_time_diarization = self.diarizer.run(
121
  audio=audio,
@@ -126,15 +135,14 @@ class WhisperBase(ABC):
126
  elapsed_time += elapsed_time_diarization
127
  return result, elapsed_time
128
 
129
- # spaces is problematic
130
- #@spaces.GPU(duration=120)
131
  def transcribe_file(self,
132
- files,
133
- file_format,
134
- add_timestamp,
135
- *whisper_params,
136
  progress=gr.Progress(),
137
- ):
 
138
  """
139
  Write subtitle file from Files
140
 
@@ -142,6 +150,9 @@ class WhisperBase(ABC):
142
  ----------
143
  files: list
144
  List of files to transcribe from gr.Files()
 
 
 
145
  file_format: str
146
  Subtitle File format to write from gr.Dropdown(). Supported format: [SRT, WebVTT, txt]
147
  add_timestamp: bool
@@ -159,16 +170,19 @@ class WhisperBase(ABC):
159
  Output file path to return to gr.Files()
160
  """
161
  try:
 
 
 
 
162
  files_info = {}
163
  for file in files:
164
  transcribed_segments, time_for_task = self.run(
165
  file.name,
 
166
  *whisper_params,
167
- progress=progress
168
  )
169
 
170
  file_name, file_ext = os.path.splitext(os.path.basename(file.name))
171
- file_name = safe_filename(file_name)
172
  subtitle, file_path = self.generate_and_write_file(
173
  file_name=file_name,
174
  transcribed_segments=transcribed_segments,
@@ -176,7 +190,6 @@ class WhisperBase(ABC):
176
  file_format=file_format,
177
  output_dir=self.output_dir
178
  )
179
- print("generated sub finished: ")
180
  files_info[file_name] = {"subtitle": subtitle, "time_for_task": time_for_task, "path": file_path}
181
 
182
  total_result = ''
@@ -195,16 +208,15 @@ class WhisperBase(ABC):
195
  except Exception as e:
196
  print(f"Error transcribing file: {e}")
197
  finally:
198
- # self.release_cuda_memory()
199
  if not files:
200
  self.remove_input_files([file.name for file in files])
201
 
202
- #@spaces.GPU(duration=120)
203
  def transcribe_mic(self,
204
  mic_audio: str,
205
  file_format: str,
 
206
  *whisper_params,
207
- progress: gr.Progress = gr.Progress(),
208
  ) -> list:
209
  """
210
  Write subtitle file from microphone
@@ -231,8 +243,8 @@ class WhisperBase(ABC):
231
  progress(0, desc="Loading Audio..")
232
  transcribed_segments, time_for_task = self.run(
233
  mic_audio,
 
234
  *whisper_params,
235
- progress=progress
236
  )
237
  progress(1, desc="Completed!")
238
 
@@ -252,13 +264,12 @@ class WhisperBase(ABC):
252
  self.release_cuda_memory()
253
  self.remove_input_files([mic_audio])
254
 
255
- #@spaces.GPU(duration=120)
256
  def transcribe_youtube(self,
257
  youtube_link: str,
258
  file_format: str,
259
  add_timestamp: bool,
 
260
  *whisper_params,
261
- progress: gr.Progress = gr.Progress(),
262
  ) -> list:
263
  """
264
  Write subtitle file from Youtube
@@ -290,8 +301,8 @@ class WhisperBase(ABC):
290
 
291
  transcribed_segments, time_for_task = self.run(
292
  audio,
 
293
  *whisper_params,
294
- progress=progress
295
  )
296
 
297
  progress(1, desc="Completed!")
@@ -318,13 +329,12 @@ class WhisperBase(ABC):
318
  else:
319
  file_path = get_ytaudio(yt)
320
 
321
- #self.release_cuda_memory()
322
  self.remove_input_files([file_path])
323
  except Exception as cleanup_error:
324
  pass
325
 
326
  @staticmethod
327
- @spaces.GPU(duration=120)
328
  def generate_and_write_file(file_name: str,
329
  transcribed_segments: list,
330
  add_timestamp: bool,
@@ -354,8 +364,8 @@ class WhisperBase(ABC):
354
  output_path: str
355
  output file path
356
  """
357
- timestamp = datetime.now().strftime("%m%d%H%M%S")
358
  if add_timestamp:
 
359
  output_path = os.path.join(output_dir, f"{file_name}-{timestamp}")
360
  else:
361
  output_path = os.path.join(output_dir, f"{file_name}")
@@ -363,17 +373,16 @@ class WhisperBase(ABC):
363
  if file_format == "SRT":
364
  content = get_srt(transcribed_segments)
365
  output_path += '.srt'
366
- write_file(content, output_path)
367
 
368
  elif file_format == "WebVTT":
369
  content = get_vtt(transcribed_segments)
370
  output_path += '.vtt'
371
- write_file(content, output_path)
372
 
373
  elif file_format == "txt":
374
  content = get_txt(transcribed_segments)
375
  output_path += '.txt'
376
- write_file(content, output_path)
 
377
  return content, output_path
378
 
379
  @staticmethod
@@ -403,12 +412,6 @@ class WhisperBase(ABC):
403
 
404
  return time_str.strip()
405
 
406
- @staticmethod
407
- def release_cuda_memory():
408
- if torch.cuda.is_available():
409
- torch.cuda.empty_cache()
410
- torch.cuda.reset_max_memory_allocated()
411
-
412
  @staticmethod
413
  def get_device():
414
  if torch.cuda.is_available():
@@ -418,6 +421,12 @@ class WhisperBase(ABC):
418
  else:
419
  return "cpu"
420
 
 
 
 
 
 
 
421
  @staticmethod
422
  def remove_input_files(file_paths: List[str]):
423
  if not file_paths:
 
6
  from typing import BinaryIO, Union, Tuple, List
7
  import numpy as np
8
  from datetime import datetime
 
9
  from faster_whisper.vad import VadOptions
10
  from dataclasses import astuple
 
11
 
12
  from modules.utils.subtitle_manager import get_srt, get_vtt, get_txt, write_file, safe_filename
13
  from modules.utils.youtube_manager import get_ytdata, get_ytaudio
14
+ from modules.utils.files_manager import get_media_files, format_gradio_files
15
  from modules.whisper.whisper_parameter import *
16
  from modules.diarize.diarizer import Diarizer
17
  from modules.vad.silero_vad import SileroVAD
 
19
 
20
  class WhisperBase(ABC):
21
  def __init__(self,
22
+ model_dir: str = os.path.join("models", "Whisper"),
23
+ diarization_model_dir: str = os.path.join("models", "Diarization"),
24
+ output_dir: str = os.path.join("outputs"),
25
  ):
 
 
26
  self.model_dir = model_dir
27
  self.output_dir = output_dir
28
  os.makedirs(self.output_dir, exist_ok=True)
29
  os.makedirs(self.model_dir, exist_ok=True)
30
+ self.diarizer = Diarizer(
31
+ model_dir=diarization_model_dir
32
+ )
33
+ self.vad = SileroVAD()
34
+
35
+ self.model = None
36
+ self.current_model_size = None
37
  self.available_models = whisper.available_models()
38
  self.available_langs = sorted(list(whisper.tokenizer.LANGUAGES.values()))
39
  self.translatable_models = ["large", "large-v1", "large-v2", "large-v3"]
40
  self.device = self.get_device()
41
  self.available_compute_types = ["float16", "float32"]
42
  self.current_compute_type = "float16" if self.device == "cuda" else "float32"
 
 
 
 
43
 
44
  @abstractmethod
 
45
  def transcribe(self,
46
  audio: Union[str, BinaryIO, np.ndarray],
47
+ progress: gr.Progress,
48
  *whisper_params,
 
49
  ):
50
+ """Inference whisper model to transcribe"""
51
  pass
52
 
53
  @abstractmethod
 
54
  def update_model(self,
55
  model_size: str,
56
  compute_type: str,
57
+ progress: gr.Progress
58
  ):
59
+ """Initialize whisper model"""
60
  pass
61
 
 
 
62
  def run(self,
63
  audio: Union[str, BinaryIO, np.ndarray],
64
+ progress: gr.Progress,
65
  *whisper_params,
 
66
  ) -> Tuple[List[dict], float]:
67
  """
68
  Run transcription with conditional pre-processing and post-processing.
 
87
  """
88
  params = WhisperParameters.as_value(*whisper_params)
89
 
90
+ if params.lang == "Automatic Detection":
91
+ params.lang = None
92
+ else:
93
+ language_code_dict = {value: key for key, value in whisper.tokenizer.LANGUAGES.items()}
94
+ params.lang = language_code_dict[params.lang]
95
+
96
+ speech_chunks = None
97
  if params.vad_filter:
98
+ # Explicit value set for float('inf') from gr.Number()
99
+ if params.max_speech_duration_s >= 9999:
100
+ params.max_speech_duration_s = float('inf')
101
+
102
  vad_options = VadOptions(
103
  threshold=params.threshold,
104
  min_speech_duration_ms=params.min_speech_duration_ms,
105
  max_speech_duration_s=params.max_speech_duration_s,
106
  min_silence_duration_ms=params.min_silence_duration_ms,
 
107
  speech_pad_ms=params.speech_pad_ms
108
  )
109
+
110
+ audio, speech_chunks = self.vad.run(
111
  audio=audio,
112
  vad_parameters=vad_options,
113
  progress=progress
114
  )
115
 
 
 
 
 
 
 
116
  result, elapsed_time = self.transcribe(
117
  audio,
118
+ progress,
119
+ *astuple(params)
120
  )
121
 
122
+ if params.vad_filter:
123
+ result = self.vad.restore_speech_timestamps(
124
+ segments=result,
125
+ speech_chunks=speech_chunks,
126
+ )
127
+
128
  if params.is_diarize:
129
  result, elapsed_time_diarization = self.diarizer.run(
130
  audio=audio,
 
135
  elapsed_time += elapsed_time_diarization
136
  return result, elapsed_time
137
 
 
 
138
  def transcribe_file(self,
139
+ files: list,
140
+ input_folder_path: str,
141
+ file_format: str,
142
+ add_timestamp: bool,
143
  progress=gr.Progress(),
144
+ *whisper_params,
145
+ ) -> list:
146
  """
147
  Write subtitle file from Files
148
 
 
150
  ----------
151
  files: list
152
  List of files to transcribe from gr.Files()
153
+ input_folder_path: str
154
+ Input folder path to transcribe from gr.Textbox(). If this is provided, `files` will be ignored and
155
+ this will be used instead.
156
  file_format: str
157
  Subtitle File format to write from gr.Dropdown(). Supported format: [SRT, WebVTT, txt]
158
  add_timestamp: bool
 
170
  Output file path to return to gr.Files()
171
  """
172
  try:
173
+ if input_folder_path:
174
+ files = get_media_files(input_folder_path)
175
+ files = format_gradio_files(files)
176
+
177
  files_info = {}
178
  for file in files:
179
  transcribed_segments, time_for_task = self.run(
180
  file.name,
181
+ progress,
182
  *whisper_params,
 
183
  )
184
 
185
  file_name, file_ext = os.path.splitext(os.path.basename(file.name))
 
186
  subtitle, file_path = self.generate_and_write_file(
187
  file_name=file_name,
188
  transcribed_segments=transcribed_segments,
 
190
  file_format=file_format,
191
  output_dir=self.output_dir
192
  )
 
193
  files_info[file_name] = {"subtitle": subtitle, "time_for_task": time_for_task, "path": file_path}
194
 
195
  total_result = ''
 
208
  except Exception as e:
209
  print(f"Error transcribing file: {e}")
210
  finally:
211
+ self.release_cuda_memory()
212
  if not files:
213
  self.remove_input_files([file.name for file in files])
214
 
 
215
  def transcribe_mic(self,
216
  mic_audio: str,
217
  file_format: str,
218
+ progress=gr.Progress(),
219
  *whisper_params,
 
220
  ) -> list:
221
  """
222
  Write subtitle file from microphone
 
243
  progress(0, desc="Loading Audio..")
244
  transcribed_segments, time_for_task = self.run(
245
  mic_audio,
246
+ progress,
247
  *whisper_params,
 
248
  )
249
  progress(1, desc="Completed!")
250
 
 
264
  self.release_cuda_memory()
265
  self.remove_input_files([mic_audio])
266
 
 
267
  def transcribe_youtube(self,
268
  youtube_link: str,
269
  file_format: str,
270
  add_timestamp: bool,
271
+ progress=gr.Progress(),
272
  *whisper_params,
 
273
  ) -> list:
274
  """
275
  Write subtitle file from Youtube
 
301
 
302
  transcribed_segments, time_for_task = self.run(
303
  audio,
304
+ progress,
305
  *whisper_params,
 
306
  )
307
 
308
  progress(1, desc="Completed!")
 
329
  else:
330
  file_path = get_ytaudio(yt)
331
 
332
+ self.release_cuda_memory()
333
  self.remove_input_files([file_path])
334
  except Exception as cleanup_error:
335
  pass
336
 
337
  @staticmethod
 
338
  def generate_and_write_file(file_name: str,
339
  transcribed_segments: list,
340
  add_timestamp: bool,
 
364
  output_path: str
365
  output file path
366
  """
 
367
  if add_timestamp:
368
+ timestamp = datetime.now().strftime("%m%d%H%M%S")
369
  output_path = os.path.join(output_dir, f"{file_name}-{timestamp}")
370
  else:
371
  output_path = os.path.join(output_dir, f"{file_name}")
 
373
  if file_format == "SRT":
374
  content = get_srt(transcribed_segments)
375
  output_path += '.srt'
 
376
 
377
  elif file_format == "WebVTT":
378
  content = get_vtt(transcribed_segments)
379
  output_path += '.vtt'
 
380
 
381
  elif file_format == "txt":
382
  content = get_txt(transcribed_segments)
383
  output_path += '.txt'
384
+
385
+ write_file(content, output_path)
386
  return content, output_path
387
 
388
  @staticmethod
 
412
 
413
  return time_str.strip()
414
 
 
 
 
 
 
 
415
  @staticmethod
416
  def get_device():
417
  if torch.cuda.is_available():
 
421
  else:
422
  return "cpu"
423
 
424
+ @staticmethod
425
+ def release_cuda_memory():
426
+ if torch.cuda.is_available():
427
+ torch.cuda.empty_cache()
428
+ torch.cuda.reset_max_memory_allocated()
429
+
430
  @staticmethod
431
  def remove_input_files(file_paths: List[str]):
432
  if not file_paths:
modules/whisper/whisper_factory.py ADDED
@@ -0,0 +1,81 @@
1
+ from typing import Optional
2
+ import os
3
+
4
+ from modules.whisper.faster_whisper_inference import FasterWhisperInference
5
+ from modules.whisper.whisper_Inference import WhisperInference
6
+ from modules.whisper.insanely_fast_whisper_inference import InsanelyFastWhisperInference
7
+ from modules.whisper.whisper_base import WhisperBase
8
+
9
+
10
+ class WhisperFactory:
11
+ @staticmethod
12
+ def create_whisper_inference(
13
+ whisper_type: str,
14
+ whisper_model_dir: str = os.path.join("models", "Whisper"),
15
+ faster_whisper_model_dir: str = os.path.join("models", "Whisper", "faster-whisper"),
16
+ insanely_fast_whisper_model_dir: str = os.path.join("models", "Whisper", "insanely-fast-whisper"),
17
+ diarization_model_dir: str = os.path.join("models", "Diarization"),
18
+ output_dir: str = os.path.join("outputs"),
19
+ ) -> "WhisperBase":
20
+ """
21
+ Create a whisper inference class based on the provided whisper_type.
22
+
23
+ Parameters
24
+ ----------
25
+ whisper_type : str
26
+ The type of Whisper implementation to use. Supported values (case-insensitive):
27
+ - "faster-whisper": https://github.com/openai/whisper
28
+ - "whisper": https://github.com/openai/whisper
29
+ - "insanely-fast-whisper": https://github.com/Vaibhavs10/insanely-fast-whisper
30
+ whisper_model_dir : str
31
+ Directory path for the Whisper model.
32
+ faster_whisper_model_dir : str
33
+ Directory path for the Faster Whisper model.
34
+ insanely_fast_whisper_model_dir : str
35
+ Directory path for the Insanely Fast Whisper model.
36
+ diarization_model_dir : str
37
+ Directory path for the diarization model.
38
+ output_dir : str
39
+ Directory path where output files will be saved.
40
+
41
+ Returns
42
+ -------
43
+ WhisperBase
44
+ An instance of the appropriate whisper inference class based on the whisper_type.
45
+ """
46
+ # Temporary fix for the bug: https://github.com/jhj0517/Whisper-WebUI/issues/144
47
+ os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'
48
+
49
+ whisper_type = whisper_type.lower().strip()
50
+
51
+ faster_whisper_typos = ["faster_whisper", "faster-whisper", "fasterwhisper"]
52
+ whisper_typos = ["whisper"]
53
+ insanely_fast_whisper_typos = [
54
+ "insanely_fast_whisper", "insanely-fast-whisper", "insanelyfastwhisper",
55
+ "insanely_faster_whisper", "insanely-faster-whisper", "insanelyfasterwhisper"
56
+ ]
57
+
58
+ if whisper_type in faster_whisper_typos:
59
+ return FasterWhisperInference(
60
+ model_dir=faster_whisper_model_dir,
61
+ output_dir=output_dir,
62
+ diarization_model_dir=diarization_model_dir
63
+ )
64
+ elif whisper_type in whisper_typos:
65
+ return WhisperInference(
66
+ model_dir=whisper_model_dir,
67
+ output_dir=output_dir,
68
+ diarization_model_dir=diarization_model_dir
69
+ )
70
+ elif whisper_type in insanely_fast_whisper_typos:
71
+ return InsanelyFastWhisperInference(
72
+ model_dir=insanely_fast_whisper_model_dir,
73
+ output_dir=output_dir,
74
+ diarization_model_dir=diarization_model_dir
75
+ )
76
+ else:
77
+ return FasterWhisperInference(
78
+ model_dir=faster_whisper_model_dir,
79
+ output_dir=output_dir,
80
+ diarization_model_dir=diarization_model_dir
81
+ )
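A minimal usage sketch for the factory; the keyword values shown are just the defaults spelled out, and any unrecognized `whisper_type` string falls back to `FasterWhisperInference`:

```python
from modules.whisper.whisper_factory import WhisperFactory

whisper_inf = WhisperFactory.create_whisper_inference(
    whisper_type="faster-whisper",   # or "whisper" / "insanely-fast-whisper"
    output_dir="outputs",
)
print(type(whisper_inf).__name__)    # FasterWhisperInference
```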
modules/whisper/whisper_parameter.py CHANGED
@@ -15,6 +15,7 @@ class WhisperParameters:
15
  best_of: gr.Number
16
  patience: gr.Number
17
  condition_on_previous_text: gr.Checkbox
 
18
  initial_prompt: gr.Textbox
19
  temperature: gr.Slider
20
  compression_ratio_threshold: gr.Number
@@ -23,13 +24,28 @@ class WhisperParameters:
23
  min_speech_duration_ms: gr.Number
24
  max_speech_duration_s: gr.Number
25
  min_silence_duration_ms: gr.Number
26
- window_size_sample: gr.Number
27
  speech_pad_ms: gr.Number
28
  chunk_length_s: gr.Number
29
  batch_size: gr.Number
30
  is_diarize: gr.Checkbox
31
  hf_token: gr.Textbox
32
  diarization_device: gr.Dropdown
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  """
34
  A data class for Gradio components of the Whisper Parameters. Use "before" Gradio pre-processing.
35
  This data class is used to mitigate the key-value problem between Gradio components and function parameters.
@@ -111,11 +127,6 @@ class WhisperParameters:
111
  This parameter is related with Silero VAD. In the end of each speech chunk wait for min_silence_duration_ms
112
  before separating it
113
 
114
- window_size_samples: gr.Number
115
- This parameter is related with Silero VAD. Audio chunks of window_size_samples size are fed to the silero VAD model.
116
- WARNING! Silero VAD models were trained using 512, 1024, 1536 samples for 16000 sample rate.
117
- Values other than these may affect model performance!!
118
-
119
  speech_pad_ms: gr.Number
120
  This parameter is related with Silero VAD. Final speech chunks are padded by speech_pad_ms each side
121
 
@@ -135,6 +146,62 @@ class WhisperParameters:
135
 
136
  diarization_device: gr.Dropdown
137
  This parameter is related with whisperx. Device to run diarization model
138
  """
139
 
140
  def as_list(self) -> list:
@@ -159,33 +226,7 @@ class WhisperParameters:
159
  WhisperValues
160
  Data class that has values of parameters
161
  """
162
- return WhisperValues(
163
- model_size=args[0],
164
- lang=args[1],
165
- is_translate=args[2],
166
- beam_size=args[3],
167
- log_prob_threshold=args[4],
168
- no_speech_threshold=args[5],
169
- compute_type=args[6],
170
- best_of=args[7],
171
- patience=args[8],
172
- condition_on_previous_text=args[9],
173
- initial_prompt=args[10],
174
- temperature=args[11],
175
- compression_ratio_threshold=args[12],
176
- vad_filter=args[13],
177
- threshold=args[14],
178
- min_speech_duration_ms=args[15],
179
- max_speech_duration_s=args[16],
180
- min_silence_duration_ms=args[17],
181
- window_size_samples=args[18],
182
- speech_pad_ms=args[19],
183
- chunk_length_s=args[20],
184
- batch_size=args[21],
185
- is_diarize=args[22],
186
- hf_token=args[23],
187
- diarization_device=args[24]
188
- )
189
 
190
 
191
  @dataclass
@@ -200,6 +241,7 @@ class WhisperValues:
200
  best_of: int
201
  patience: float
202
  condition_on_previous_text: bool
 
203
  initial_prompt: Optional[str]
204
  temperature: float
205
  compression_ratio_threshold: float
@@ -208,13 +250,28 @@ class WhisperValues:
208
  min_speech_duration_ms: int
209
  max_speech_duration_s: float
210
  min_silence_duration_ms: int
211
- window_size_samples: int
212
  speech_pad_ms: int
213
  chunk_length_s: int
214
  batch_size: int
215
  is_diarize: bool
216
  hf_token: str
217
  diarization_device: str
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
218
  """
219
  A data class to use Whisper parameters.
220
- """
 
15
  best_of: gr.Number
16
  patience: gr.Number
17
  condition_on_previous_text: gr.Checkbox
18
+ prompt_reset_on_temperature: gr.Slider
19
  initial_prompt: gr.Textbox
20
  temperature: gr.Slider
21
  compression_ratio_threshold: gr.Number
 
24
  min_speech_duration_ms: gr.Number
25
  max_speech_duration_s: gr.Number
26
  min_silence_duration_ms: gr.Number
 
27
  speech_pad_ms: gr.Number
28
  chunk_length_s: gr.Number
29
  batch_size: gr.Number
30
  is_diarize: gr.Checkbox
31
  hf_token: gr.Textbox
32
  diarization_device: gr.Dropdown
33
+ length_penalty: gr.Number
34
+ repetition_penalty: gr.Number
35
+ no_repeat_ngram_size: gr.Number
36
+ prefix: gr.Textbox
37
+ suppress_blank: gr.Checkbox
38
+ suppress_tokens: gr.Textbox
39
+ max_initial_timestamp: gr.Number
40
+ word_timestamps: gr.Checkbox
41
+ prepend_punctuations: gr.Textbox
42
+ append_punctuations: gr.Textbox
43
+ max_new_tokens: gr.Number
44
+ chunk_length: gr.Number
45
+ hallucination_silence_threshold: gr.Number
46
+ hotwords: gr.Textbox
47
+ language_detection_threshold: gr.Number
48
+ language_detection_segments: gr.Number
49
  """
50
  A data class for Gradio components of the Whisper Parameters. Use "before" Gradio pre-processing.
51
  This data class is used to mitigate the key-value problem between Gradio components and function parameters.
 
127
  This parameter is related with Silero VAD. In the end of each speech chunk wait for min_silence_duration_ms
128
  before separating it
129
 
 
 
 
 
 
130
  speech_pad_ms: gr.Number
131
  This parameter is related with Silero VAD. Final speech chunks are padded by speech_pad_ms each side
132
 
 
146
 
147
  diarization_device: gr.Dropdown
148
  This parameter is related with whisperx. Device to run diarization model
149
+
150
+ length_penalty:
151
+ This parameter is related to faster-whisper. Exponential length penalty constant.
152
+
153
+ repetition_penalty:
154
+ This parameter is related to faster-whisper. Penalty applied to the score of previously generated tokens
155
+ (set > 1 to penalize).
156
+
157
+ no_repeat_ngram_size:
158
+ This parameter is related to faster-whisper. Prevent repetitions of n-grams with this size (set 0 to disable).
159
+
160
+ prefix:
161
+ This parameter is related to faster-whisper. Optional text to provide as a prefix for the first window.
162
+
163
+ suppress_blank:
164
+ This parameter is related to faster-whisper. Suppress blank outputs at the beginning of the sampling.
165
+
166
+ suppress_tokens:
167
+ This parameter is related to faster-whisper. List of token IDs to suppress. -1 will suppress a default set
168
+ of symbols as defined in the model config.json file.
169
+
170
+ max_initial_timestamp:
171
+ This parameter is related to faster-whisper. The initial timestamp cannot be later than this.
172
+
173
+ word_timestamps:
174
+ This parameter is related to faster-whisper. Extract word-level timestamps using the cross-attention pattern
175
+ and dynamic time warping, and include the timestamps for each word in each segment.
176
+
177
+ prepend_punctuations:
178
+ This parameter is related to faster-whisper. If word_timestamps is True, merge these punctuation symbols
179
+ with the next word.
180
+
181
+ append_punctuations:
182
+ This parameter is related to faster-whisper. If word_timestamps is True, merge these punctuation symbols
183
+ with the previous word.
184
+
185
+ max_new_tokens:
186
+ This parameter is related to faster-whisper. Maximum number of new tokens to generate per-chunk. If not set,
187
+ the maximum will be set by the default max_length.
188
+
189
+ chunk_length:
190
+ This parameter is related to faster-whisper. The length of audio segments. If it is not None, it will overwrite the
191
+ default chunk_length of the FeatureExtractor.
192
+
193
+ hallucination_silence_threshold:
194
+ This parameter is related to faster-whisper. When word_timestamps is True, skip silent periods longer than this threshold
195
+ (in seconds) when a possible hallucination is detected.
196
+
197
+ hotwords:
198
+ This parameter is related to faster-whisper. Hotwords/hint phrases to provide the model with. Has no effect if prefix is not None.
199
+
200
+ language_detection_threshold:
201
+ This parameter is related to faster-whisper. If the maximum probability of the language tokens is higher than this value, the language is detected.
202
+
203
+ language_detection_segments:
204
+ This parameter is related to faster-whisper. Number of segments to consider for the language detection.
205
  """
206
 
207
  def as_list(self) -> list:
 
226
  WhisperValues
227
  Data class that has values of parameters
228
  """
229
+ return WhisperValues(*args)
230
 
231
 
232
  @dataclass
 
241
  best_of: int
242
  patience: float
243
  condition_on_previous_text: bool
244
+ prompt_reset_on_temperature: float
245
  initial_prompt: Optional[str]
246
  temperature: float
247
  compression_ratio_threshold: float
 
250
  min_speech_duration_ms: int
251
  max_speech_duration_s: float
252
  min_silence_duration_ms: int
 
253
  speech_pad_ms: int
254
  chunk_length_s: int
255
  batch_size: int
256
  is_diarize: bool
257
  hf_token: str
258
  diarization_device: str
259
+ length_penalty: float
260
+ repetition_penalty: float
261
+ no_repeat_ngram_size: int
262
+ prefix: Optional[str]
263
+ suppress_blank: bool
264
+ suppress_tokens: Optional[str]
265
+ max_initial_timestamp: float
266
+ word_timestamps: bool
267
+ prepend_punctuations: Optional[str]
268
+ append_punctuations: Optional[str]
269
+ max_new_tokens: Optional[int]
270
+ chunk_length: Optional[int]
271
+ hallucination_silence_threshold: Optional[float]
272
+ hotwords: Optional[str]
273
+ language_detection_threshold: Optional[float]
274
+ language_detection_segments: int
275
  """
276
  A data class to use Whisper parameters.
277
+ """
notebook/whisper-webui.ipynb CHANGED
@@ -13,7 +13,7 @@
13
  "\n",
14
  "If you find this project useful, please consider supporting it:\n",
15
  "\n",
16
- "<a href=\"https://ko-fi.com/A0A7JSQRJ\" target=\"_blank\">\n",
17
  " <img src=\"https://storage.ko-fi.com/cdn/kofi2.png?v=3\" alt=\"Buy Me a Coffee at ko-fi.com\" height=\"36\">\n",
18
  "</a>\n",
19
  "\n",
@@ -53,9 +53,10 @@
53
  "!git clone https://github.com/jhj0517/Whisper-WebUI.git\n",
54
  "%cd Whisper-WebUI\n",
55
  "!pip install git+https://github.com/jhj0517/jhj0517-whisper.git\n",
56
- "!pip install faster-whisper==1.0.2\n",
57
- "!pip install gradio==4.14.0\n",
58
- "!pip install pytube\n",
 
59
  "!pip install tokenizers==0.19.1\n",
60
  "!pip install pyannote.audio==3.3.1"
61
  ]
@@ -70,7 +71,7 @@
70
  "\n",
71
  "USERNAME = '' #@param {type: \"string\"}\n",
72
  "PASSWORD = '' #@param {type: \"string\"}\n",
73
- "WHISPER_TYPE = 'faster-whisper' #@param {type: \"string\"}\n",
74
  "THEME = '' #@param {type: \"string\"}\n",
75
  "\n",
76
  "arguments = \"\"\n",
 
13
  "\n",
14
  "If you find this project useful, please consider supporting it:\n",
15
  "\n",
16
+ "<a href=\"https://ko-fi.com/jhj0517\" target=\"_blank\">\n",
17
  " <img src=\"https://storage.ko-fi.com/cdn/kofi2.png?v=3\" alt=\"Buy Me a Coffee at ko-fi.com\" height=\"36\">\n",
18
  "</a>\n",
19
  "\n",
 
53
  "!git clone https://github.com/jhj0517/Whisper-WebUI.git\n",
54
  "%cd Whisper-WebUI\n",
55
  "!pip install git+https://github.com/jhj0517/jhj0517-whisper.git\n",
56
+ "!pip install faster-whisper==1.0.3\n",
57
+ "!pip install gradio==4.29.0\n",
58
+ "# Temporal bug fix from https://github.com/jhj0517/Whisper-WebUI/issues/220\n",
59
+ "!pip install pytubefix\n",
60
  "!pip install tokenizers==0.19.1\n",
61
  "!pip install pyannote.audio==3.3.1"
62
  ]
 
71
  "\n",
72
  "USERNAME = '' #@param {type: \"string\"}\n",
73
  "PASSWORD = '' #@param {type: \"string\"}\n",
74
+ "WHISPER_TYPE = 'faster-whisper' # @param [\"whisper\", \"faster-whisper\", \"insanely-fast-whisper\"]\n",
75
  "THEME = '' #@param {type: \"string\"}\n",
76
  "\n",
77
  "arguments = \"\"\n",
requirements.txt CHANGED
@@ -1,8 +1,13 @@
+ # Remove the --extra-index-url line below if you're not using Nvidia GPU.
+ # If you're using it, update url to your CUDA version (CUDA 12.1 is minimum requirement):
+ # For CUDA 12.1, use : https://download.pytorch.org/whl/cu121
+ # For CUDA 12.4, use : https://download.pytorch.org/whl/cu124
+
  --extra-index-url https://download.pytorch.org/whl/cu121
  torch
  git+https://github.com/jhj0517/jhj0517-whisper.git
- faster-whisper==1.0.2
- transformers
- pytube
- gradio
+ faster-whisper==1.0.3
+ transformers==4.42.3
+ gradio==4.29.0
+ pytubefix
  pyannote.audio==3.3.1
start-webui.bat CHANGED
@@ -1,7 +1,7 @@
  @echo off

  call venv\scripts\activate
- python app.py
+ python app.py %*

  echo "launching the app"
  pause