|
import torch |
|
from typing import Dict |
|
from transformers import pipeline |
|
from datasets import load_dataset |
|
from transformers.pipelines.audio_utils import ffmpeg_read |
|
|
|
# Target audio sampling rate in Hz; same value the handler passes to
# ffmpeg_read when decoding incoming audio.
SAMPLE_RATE = 16_000
|
class EndpointHandler:
    """Callable request handler wrapping a Whisper speech-recognition pipeline.

    The pipeline is built once in ``__init__`` and reused for every call.
    """

    def __init__(self, path=""):
        """Build the automatic-speech-recognition pipeline.

        Args:
            path: Accepted for interface compatibility; not used here, since
                the model id is fixed to ``openai/whisper-large``.
        """
        # Run on the first GPU when one is available, otherwise on CPU.
        device = "cuda:0" if torch.cuda.is_available() else "cpu"
        self.pipe = pipeline(
            "automatic-speech-recognition",
            model="openai/whisper-large",
            chunk_length_s=30,
            device=device,
        )

    def __call__(self, data: Dict[str, bytes]) -> Dict[str, str]:
        """Transcribe the audio carried by a request payload.

        Args:
            data: Request payload. The raw audio bytes are read from the
                ``"inputs"`` key (the key is removed from ``data``); when the
                key is absent, ``data`` itself is used as the input.

        Returns:
            A dict with a single ``"text"`` key holding the transcription.
        """
        inputs = data.pop("inputs", data)

        # Decode the audio bytes to a numpy waveform at the module-level
        # sampling rate instead of repeating the literal.
        audio_nparray = ffmpeg_read(inputs, SAMPLE_RATE)

        prediction = self.pipe(audio_nparray, return_timestamps=True)

        # For a single input the pipeline result is a dict keyed by "text"
        # (plus "chunks" when timestamps are requested), so it must be read
        # by key; integer indexing would raise KeyError.
        return {"text": prediction["text"]}
|
|
|
|
|
|
|
|
|
|