# Scraped from a HuggingFace Space file viewer (commit d8d26b1, 2,807 bytes);
# the viewer chrome (status lines and line-number gutter) has been stripped.
import os
import tempfile

import bentoml
import ffmpeg
import numpy as np
import torch
from faster_whisper import WhisperModel
from transformers import pipeline
class AudioTranscriber(bentoml.Runnable):
    """BentoML runnable that transcribes audio, either directly from a file
    path via faster-whisper or from an uploaded file via ffmpeg decoding and a
    transformers Whisper ASR pipeline."""

    SUPPORTED_RESOURCES = ("nvidia.com/gpu", "cpu")
    SUPPORTS_CPU_MULTI_THREADING = True
    # Target rate for ffmpeg resampling; Whisper models expect 16 kHz mono.
    SAMPLE_RATE = 16000

    def __init__(self):
        # faster-whisper "base" checkpoint, used by transcribe_audio_faster.
        self.faster_model = WhisperModel("base")
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        # NOTE(review): transcribe_audio reads self.model, self.tokenizer and
        # self.extractor, but they are never assigned anywhere in this class —
        # calling transcribe_audio as written raises AttributeError. Assign
        # them here (or construct the pipeline once here) before using it.

    @bentoml.Runnable.method(batchable=False)
    def transcribe_audio_faster(self, temp_file_path):
        """Transcribe the audio file at *temp_file_path* using faster-whisper.

        Returns:
            A tuple ``(transcription, info, segment_info)`` where
            ``transcription`` is a list of segment texts, ``info`` is the
            faster-whisper transcription info object, and ``segment_info`` is
            a list of ``(start, end)`` timestamps in seconds.
        """
        segments, info = self.faster_model.transcribe(temp_file_path)
        transcription = []
        segment_info = []
        for segment in segments:
            print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text))
            transcription.append(segment.text)
            segment_info.append((segment.start, segment.end))
        return transcription, info, segment_info

    @bentoml.Runnable.method(batchable=False)
    def transcribe_audio(self, file):
        """Decode *file* (a readable binary file-like object) to 16 kHz mono
        PCM with ffmpeg, then transcribe it with a transformers Whisper
        pipeline.

        Raises:
            RuntimeError: if ffmpeg fails to decode the input.
        """
        # ffmpeg.input() needs a real path, so spill the upload to disk first.
        with tempfile.NamedTemporaryFile(delete=False) as temp_file:
            temp_file.write(file.read())
            temp_file_path = temp_file.name
        try:
            try:
                # Launches an ffmpeg subprocess that down-mixes to mono and
                # resamples to SAMPLE_RATE. Requires the ffmpeg CLI and the
                # `ffmpeg-python` package.
                out, _ = (
                    ffmpeg.input(temp_file_path, threads=0)
                    .output("-", format="s16le", acodec="pcm_s16le", ac=1, ar=self.SAMPLE_RATE)
                    .run(cmd="ffmpeg", capture_stdout=True, capture_stderr=True)
                )
            except ffmpeg.Error as e:
                # Chain the cause so the original ffmpeg traceback is kept.
                raise RuntimeError(f"Failed to load audio: {e.stderr.decode()}") from e
        finally:
            # delete=False above means we own cleanup; the original leaked one
            # temp file per call.
            os.remove(temp_file_path)
        # Reinterpret the raw s16le bytes as float32 samples in [-1.0, 1.0).
        input_features = np.frombuffer(out, np.int16).flatten().astype(np.float32) / 32768.0
        print("Input_features", type(input_features))
        pipe = pipeline("automatic-speech-recognition",
                        model=self.model, tokenizer=self.tokenizer, feature_extractor=self.extractor,
                        device=self.device)
        result = self.get_long_transcription_whisper(input_features, pipe)
        return result

    @staticmethod
    def get_long_transcription_whisper(input_features, pipe, return_timestamps=True,
                                       chunk_length_s=10, stride_length_s=2):
        """Run *pipe* over a long waveform using chunked inference.

        Args:
            input_features: 1-D float32 numpy waveform (16 kHz expected).
            pipe: a transformers automatic-speech-recognition pipeline.
            return_timestamps: forwarded to the pipeline; include timestamps.
            chunk_length_s: chunk window length in seconds.
            stride_length_s: overlap between consecutive chunks in seconds.

        Returns:
            The pipeline's output dict (text plus optional chunk timestamps).
        """
        return pipe(input_features, return_timestamps=return_timestamps,
                    chunk_length_s=chunk_length_s, stride_length_s=stride_length_s)