import os
import tempfile

import bentoml
import ffmpeg
import numpy as np
import torch
from faster_whisper import WhisperModel
from transformers import pipeline


class AudioTranscriber(bentoml.Runnable):
    """BentoML runnable exposing two Whisper-based transcription methods.

    ``transcribe_audio_faster`` delegates to a faster-whisper ``WhisperModel``;
    ``transcribe_audio`` decodes the upload with the ffmpeg CLI and feeds the
    samples to a Hugging Face ``automatic-speech-recognition`` pipeline.
    """

    SUPPORTED_RESOURCES = ("nvidia.com/gpu", "cpu")
    SUPPORTS_CPU_MULTI_THREADING = True
    SAMPLE_RATE = 16000

    # NOTE(review): the original code passed self.model / self.tokenizer /
    # self.extractor to the pipeline without ever assigning them, so the
    # method always failed with AttributeError. This checkpoint id is used
    # when no model object has been assigned; it was picked to match the
    # "base" size loaded for faster-whisper -- confirm it is the intended one.
    HF_MODEL_ID = "openai/whisper-base"

    def __init__(self):
        self.faster_model = WhisperModel("base")
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        # Optional overrides for the Hugging Face pipeline; when left as None
        # the pipeline resolves them from HF_MODEL_ID.
        self.model = None
        self.tokenizer = None
        self.extractor = None
        # Built on first use by _get_pipeline() and then reused.
        self._pipe = None

    @bentoml.Runnable.method(batchable=False)
    def transcribe_audio_faster(self, temp_file_path):
        """Transcribe the audio at *temp_file_path* with the faster-whisper model.

        Each segment is also printed as ``[start -> end] text``.

        Returns:
            A tuple ``(transcription, info, segment_info)`` where
            ``transcription`` is the list of segment texts, ``info`` is the
            second value returned by ``faster_model.transcribe`` and
            ``segment_info`` is a list of ``(start, end)`` pairs, one per
            segment.
        """
        segments, info = self.faster_model.transcribe(temp_file_path)
        transcription = []
        segment_info = []
        for segment in segments:
            print(f"[{segment.start:.2f}s -> {segment.end:.2f}s] {segment.text}")
            transcription.append(segment.text)
            segment_info.append((segment.start, segment.end))
        return transcription, info, segment_info

    @bentoml.Runnable.method(batchable=False)
    def transcribe_audio(self, file):
        """Transcribe an uploaded audio file with the Hugging Face pipeline.

        Args:
            file: Object with a ``read()`` method returning the raw bytes of
                the audio file.

        Returns:
            Whatever the pipeline returns for the decoded samples (see
            ``get_long_transcription_whisper``).

        Raises:
            RuntimeError: If ffmpeg fails to decode the audio.
        """
        # ffmpeg needs a real path, so spool the upload to disk first.
        temp_file = tempfile.NamedTemporaryFile(delete=False)
        temp_file_path = temp_file.name
        try:
            # Close the handle before ffmpeg opens the path.
            with temp_file:
                temp_file.write(file.read())
            input_features = self._decode_audio(temp_file_path)
        finally:
            # delete=False means nobody else removes the file; clean up even
            # when writing or decoding failed.
            try:
                os.remove(temp_file_path)
            except OSError:
                # Best-effort cleanup: never mask the original error.
                pass

        print("Input_features", type(input_features))
        return self.get_long_transcription_whisper(input_features, self._get_pipeline())

    def _decode_audio(self, path):
        """Decode *path* to mono float32 samples at ``SAMPLE_RATE`` via ffmpeg."""
        try:
            # This launches a subprocess to decode audio while down-mixing and
            # resampling as necessary. Requires the ffmpeg CLI and the
            # `ffmpeg-python` package to be installed.
            out, _ = (
                ffmpeg.input(path, threads=0)
                .output("-", format="s16le", acodec="pcm_s16le", ac=1, ar=self.SAMPLE_RATE)
                .run(cmd="ffmpeg", capture_stdout=True, capture_stderr=True)
            )
        except ffmpeg.Error as e:
            detail = e.stderr.decode(errors="replace")
            raise RuntimeError(f"Failed to load audio: {detail}") from e
        # Scale signed 16-bit PCM by 1/32768 into float32.
        return np.frombuffer(out, np.int16).astype(np.float32) / 32768.0

    def _get_pipeline(self):
        """Return the ASR pipeline, building it once on first use."""
        if self._pipe is None:
            model = self.HF_MODEL_ID if self.model is None else self.model
            self._pipe = pipeline(
                "automatic-speech-recognition",
                model=model,
                tokenizer=self.tokenizer,
                feature_extractor=self.extractor,
                device=self.device,
            )
        return self._pipe

    @staticmethod
    def get_long_transcription_whisper(
        input_features,
        pipe,
        return_timestamps=True,
        chunk_length_s=10,
        stride_length_s=2,
    ):
        """Run *pipe* over a long audio signal using chunked inference.

        Args:
            input_features: ``numpy.ndarray`` of audio samples.
            pipe: Callable pipeline invoked with the samples.
            return_timestamps: Forwarded to *pipe*.
            chunk_length_s: Forwarded to *pipe*.
            stride_length_s: Forwarded to *pipe*.

        Returns:
            The value returned by *pipe*.
        """
        return pipe(
            input_features,
            return_timestamps=return_timestamps,
            chunk_length_s=chunk_length_s,
            stride_length_s=stride_length_s,
        )