Spaces:

LAP-DEV
/

Demo

Running

App Files Community

LAP-DEV committed on Feb 27

Commit

b7531d2

verified ·

1 Parent(s): a4d8f97

Update modules/whisper/faster_whisper_inference.py

Browse files

Files changed (1) hide show

modules/whisper/faster_whisper_inference.py +40 -33

modules/whisper/faster_whisper_inference.py CHANGED Viewed

@@ -1,5 +1,6 @@
 import os
 import time
 import numpy as np
 import torch
 from typing import BinaryIO, Union, Tuple, List
@@ -12,11 +13,11 @@ import gradio as gr
 from argparse import Namespace
 from modules.utils.paths import (FASTER_WHISPER_MODELS_DIR, DIARIZATION_MODELS_DIR, UVR_MODELS_DIR, OUTPUT_DIR)
-from modules.whisper.whisper_parameter import *
-from modules.whisper.whisper_base import WhisperBase
-class FasterWhisperInference(WhisperBase):
     def __init__(self,
                  model_dir: str = FASTER_WHISPER_MODELS_DIR,
                  diarization_model_dir: str = DIARIZATION_MODELS_DIR,
@@ -35,14 +36,12 @@ class FasterWhisperInference(WhisperBase):
         self.model_paths = self.get_model_paths()
         self.device = self.get_device()
         self.available_models = self.model_paths.keys()
-        self.available_compute_types = ctranslate2.get_supported_compute_types(
-            "cuda") if self.device == "cuda" else ctranslate2.get_supported_compute_types("cpu")
     def transcribe(self,
                    audio: Union[str, BinaryIO, np.ndarray],
                    progress: gr.Progress = gr.Progress(),
                    *whisper_params,
-                   ) -> Tuple[List[dict], float]:
         """
         transcribe method for faster-whisper.
@@ -57,32 +56,22 @@ class FasterWhisperInference(WhisperBase):
         Returns
         ----------
-        segments_result: List[dict]
-            list of dicts that includes start, end timestamps and transcribed text
         elapsed_time: float
             elapsed time for transcription
         """
         start_time = time.time()
-        params = WhisperParameters.as_value(*whisper_params)
         if params.model_size != self.current_model_size or self.model is None or self.current_compute_type != params.compute_type:
             self.update_model(params.model_size, params.compute_type, progress)
-        # None parameters with Textboxes: https://github.com/gradio-app/gradio/issues/8723
-        if not params.initial_prompt:
-            params.initial_prompt = None
-        if not params.prefix:
-            params.prefix = None
-        if not params.hotwords:
-            params.hotwords = None
-        params.suppress_tokens = self.format_suppress_tokens_str(params.suppress_tokens)
         segments, info = self.model.transcribe(
             audio=audio,
             language=params.lang,
-            task="translate" if params.is_translate and self.current_model_size in self.translatable_models else "transcribe",
             beam_size=params.beam_size,
             log_prob_threshold=params.log_prob_threshold,
             no_speech_threshold=params.no_speech_threshold,
@@ -109,16 +98,12 @@ class FasterWhisperInference(WhisperBase):
             language_detection_segments=params.language_detection_segments,
             prompt_reset_on_temperature=params.prompt_reset_on_temperature,
         )
-        progress(0, desc="Loading audio...")
         segments_result = []
         for segment in segments:
-            progress(segment.start / info.duration, desc="Transcribing...")
-            segments_result.append({
-                "start": segment.start,
-                "end": segment.end,
-                "text": segment.text
-            })
         elapsed_time = time.time() - start_time
         return segments_result, elapsed_time
@@ -134,21 +119,43 @@ class FasterWhisperInference(WhisperBase):
         Parameters
         ----------
         model_size: str
-            Size of whisper model
         compute_type: str
             Compute type for transcription.
             see more info : https://opennmt.net/CTranslate2/quantization.html
         progress: gr.Progress
             Indicator to show progress directly in gradio.
         """
-        progress(0, desc="Initializing Model...")
-        self.current_model_size = self.model_paths[model_size]
         self.current_compute_type = compute_type
         self.model = faster_whisper.WhisperModel(
             device=self.device,
             model_size_or_path=self.current_model_size,
             download_root=self.model_dir,
-            compute_type=self.current_compute_type
         )
     def get_model_paths(self):
@@ -163,7 +170,7 @@ class FasterWhisperInference(WhisperBase):
         faster_whisper_prefix = "models--Systran--faster-whisper-"
         existing_models = os.listdir(self.model_dir)
-        wrong_dirs = [".locks"]
         existing_models = list(set(existing_models) - set(wrong_dirs))
         for model_name in existing_models:
@@ -189,4 +196,4 @@ class FasterWhisperInference(WhisperBase):
                 raise ValueError("Invalid Suppress Tokens. The value must be type of List[int]")
             return suppress_tokens
         except Exception as e:
-            raise ValueError("Invalid Suppress Tokens. The value must be type of List[int]")

 import os
 import time
+import huggingface_hub
 import numpy as np
 import torch
 from typing import BinaryIO, Union, Tuple, List
 from argparse import Namespace
 from modules.utils.paths import (FASTER_WHISPER_MODELS_DIR, DIARIZATION_MODELS_DIR, UVR_MODELS_DIR, OUTPUT_DIR)
+from modules.whisper.data_classes import *
+from modules.whisper.base_transcription_pipeline import BaseTranscriptionPipeline
+class FasterWhisperInference(BaseTranscriptionPipeline):
     def __init__(self,
                  model_dir: str = FASTER_WHISPER_MODELS_DIR,
                  diarization_model_dir: str = DIARIZATION_MODELS_DIR,
         self.model_paths = self.get_model_paths()
         self.device = self.get_device()
         self.available_models = self.model_paths.keys()
     def transcribe(self,
                    audio: Union[str, BinaryIO, np.ndarray],
                    progress: gr.Progress = gr.Progress(),
                    *whisper_params,
+                   ) -> Tuple[List[Segment], float]:
         """
         transcribe method for faster-whisper.
         Returns
         ----------
+        segments_result: List[Segment]
+            list of Segment that includes start, end timestamps and transcribed text
         elapsed_time: float
             elapsed time for transcription
         """
         start_time = time.time()
+        params = WhisperParams.from_list(list(whisper_params))
         if params.model_size != self.current_model_size or self.model is None or self.current_compute_type != params.compute_type:
             self.update_model(params.model_size, params.compute_type, progress)
         segments, info = self.model.transcribe(
             audio=audio,
             language=params.lang,
+            task="translate" if params.is_translate else "transcribe",
             beam_size=params.beam_size,
             log_prob_threshold=params.log_prob_threshold,
             no_speech_threshold=params.no_speech_threshold,
             language_detection_segments=params.language_detection_segments,
             prompt_reset_on_temperature=params.prompt_reset_on_temperature,
         )
+        progress(0, desc="Loading audio..")
         segments_result = []
         for segment in segments:
+            progress(segment.start / info.duration, desc="Transcribing..")
+            segments_result.append(Segment.from_faster_whisper(segment))
         elapsed_time = time.time() - start_time
         return segments_result, elapsed_time
         Parameters
         ----------
         model_size: str
+            Size of whisper model. If you enter the huggingface repo id, it will try to download the model
+            automatically from huggingface.
         compute_type: str
             Compute type for transcription.
             see more info : https://opennmt.net/CTranslate2/quantization.html
         progress: gr.Progress
             Indicator to show progress directly in gradio.
         """
+        progress(0, desc="Initializing Model..")
+        model_size_dirname = model_size.replace("/", "--") if "/" in model_size else model_size
+        if model_size not in self.model_paths and model_size_dirname not in self.model_paths:
+            print(f"Model is not detected. Trying to download \"{model_size}\" from huggingface to "
+                  f"\"{os.path.join(self.model_dir, model_size_dirname)} ...")
+            huggingface_hub.snapshot_download(
+                model_size,
+                local_dir=os.path.join(self.model_dir, model_size_dirname),
+            )
+            self.model_paths = self.get_model_paths()
+            gr.Info(f"Model is downloaded with the name \"{model_size_dirname}\"")
+        self.current_model_size = self.model_paths[model_size_dirname]
+        local_files_only = False
+        hf_prefix = "models--Systran--faster-whisper-"
+        official_model_path = os.path.join(self.model_dir, hf_prefix+model_size)
+        if ((os.path.isdir(self.current_model_size) and os.path.exists(self.current_model_size)) or
+            (model_size in faster_whisper.available_models() and os.path.exists(official_model_path))):
+            local_files_only = True
         self.current_compute_type = compute_type
         self.model = faster_whisper.WhisperModel(
             device=self.device,
             model_size_or_path=self.current_model_size,
             download_root=self.model_dir,
+            compute_type=self.current_compute_type,
+            local_files_only=local_files_only
         )
     def get_model_paths(self):
         faster_whisper_prefix = "models--Systran--faster-whisper-"
         existing_models = os.listdir(self.model_dir)
+        wrong_dirs = [".locks", "faster_whisper_models_will_be_saved_here"]
         existing_models = list(set(existing_models) - set(wrong_dirs))
         for model_name in existing_models:
                 raise ValueError("Invalid Suppress Tokens. The value must be type of List[int]")
             return suppress_tokens
         except Exception as e:
+            raise ValueError("Invalid Suppress Tokens. The value must be type of List[int]")