# Spaces: Sleeping
import os
import tempfile

import bentoml
import ffmpeg
import numpy as np
import torch
from faster_whisper import WhisperModel
from transformers import pipeline
class AudioTranscriber(bentoml.Runnable):
    """BentoML runnable offering two Whisper-based transcription paths.

    ``transcribe_audio_faster`` runs the ``faster_whisper`` model loaded in
    ``__init__`` on a file path. ``transcribe_audio`` decodes an uploaded
    file with the ffmpeg CLI and feeds the samples to a ``transformers``
    automatic-speech-recognition pipeline.
    """

    SUPPORTED_RESOURCES = ("nvidia.com/gpu", "cpu")
    SUPPORTS_CPU_MULTI_THREADING = True
    # Sample rate (Hz) that ffmpeg resamples the audio to in transcribe_audio().
    SAMPLE_RATE = 16000

    def __init__(self):
        """Load the faster-whisper ``"base"`` model and select the torch device."""
        self.faster_model = WhisperModel("base")
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        # NOTE(review): transcribe_audio() reads self.model, self.tokenizer and
        # self.extractor, but none of them is assigned here. Unless a subclass
        # or caller sets them, that method raises AttributeError -- confirm
        # which checkpoint/processor is intended and load it in __init__.

    def transcribe_audio_faster(self, temp_file_path):
        """Transcribe the audio at ``temp_file_path`` with faster-whisper.

        Each segment is also printed to stdout as ``[start -> end] text``.

        Args:
            temp_file_path: Path of the audio file handed to
                ``WhisperModel.transcribe``.

        Returns:
            A tuple ``(transcription, info, segment_info)`` where
            ``transcription`` is the list of segment texts, ``info`` is the
            second value returned by ``WhisperModel.transcribe`` and
            ``segment_info`` is a list of ``(start, end)`` pairs, one per
            segment, in the same order as ``transcription``.
        """
        segments, info = self.faster_model.transcribe(temp_file_path)
        transcription = []
        segment_info = []
        for segment in segments:
            print(f"[{segment.start:.2f}s -> {segment.end:.2f}s] {segment.text}")
            transcription.append(segment.text)
            segment_info.append((segment.start, segment.end))
        return transcription, info, segment_info

    def transcribe_audio(self, file):
        """Decode an uploaded audio file and transcribe it with the HF pipeline.

        The upload is spooled to a temporary file so the ffmpeg CLI can read
        it; that file is always removed before this method returns or raises.

        Args:
            file: Binary file-like object; its full content is read with
                ``file.read()``.

        Returns:
            Whatever the ``transformers`` ASR pipeline returns for the decoded
            audio (see ``get_long_transcription_whisper``).

        Raises:
            RuntimeError: If ffmpeg fails to decode the audio. The original
                ``ffmpeg.Error`` is attached as ``__cause__``.
        """
        # delete=False because ffmpeg re-opens the file by name after we close
        # it; cleanup is done explicitly in the ``finally`` below.
        temp_file = tempfile.NamedTemporaryFile(delete=False)
        try:
            with temp_file:
                temp_file.write(file.read())
            # This launches a subprocess to decode audio while down-mixing and
            # resampling as necessary. Requires the ffmpeg CLI and the
            # `ffmpeg-python` package to be installed.
            out, _ = (
                ffmpeg.input(temp_file.name, threads=0)
                .output("-", format="s16le", acodec="pcm_s16le", ac=1, ar=self.SAMPLE_RATE)
                .run(cmd="ffmpeg", capture_stdout=True, capture_stderr=True)
            )
        except ffmpeg.Error as e:
            raise RuntimeError(f"Failed to load audio: {e.stderr.decode()}") from e
        finally:
            # Previously the temporary file was never removed (one leak per call).
            os.unlink(temp_file.name)

        # Raw signed 16-bit PCM -> float32 samples scaled by 1/32768.
        input_features = np.frombuffer(out, np.int16).flatten().astype(np.float32) / 32768.0
        print("Input_features", type(input_features))
        pipe = pipeline("automatic-speech-recognition",
                        model=self.model, tokenizer=self.tokenizer, feature_extractor=self.extractor,
                        device=self.device)
        result = self.get_long_transcription_whisper(input_features, pipe)
        return result

    @staticmethod
    def get_long_transcription_whisper(input_features, pipe, return_timestamps=True,
                                       chunk_length_s=10, stride_length_s=2):
        """Get the transcription of a long audio file using the Whisper model.

        Declared as a static method: the function takes no ``self``, yet it is
        invoked as ``self.get_long_transcription_whisper(input_features, pipe)``,
        which previously shifted every argument by one position.

        Args:
            input_features: numpy.ndarray of audio samples passed to ``pipe``.
            pipe: Callable ASR pipeline.
            return_timestamps: Forwarded to ``pipe``.
            chunk_length_s: Forwarded to ``pipe``.
            stride_length_s: Forwarded to ``pipe``.

        Returns:
            The value returned by ``pipe``.
        """
        return pipe(input_features, return_timestamps=return_timestamps,
                    chunk_length_s=chunk_length_s, stride_length_s=stride_length_s)