import torch
from typing import Dict
from transformers import pipeline
from datasets import load_dataset  # only used by the commented-out test snippet below
from transformers.pipelines.audio_utils import ffmpeg_read
SAMPLE_RATE = 16000
class EndpointHandler:
    def __init__(self, path=""):
        # Run on the first GPU when one is available, otherwise on CPU.
        device = "cuda:0" if torch.cuda.is_available() else "cpu"
        self.pipe = pipeline(
            "automatic-speech-recognition",
            model="openai/whisper-large",
            chunk_length_s=30,
            device=device,
        )
    def __call__(self, data: Dict[str, bytes]) -> Dict[str, str]:
        # For local testing, a sample can be pulled from a dummy dataset instead:
        # ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
        # sample = ds[0]["audio"]
        inputs = data.pop("inputs", data)
        # Decode the raw audio bytes into a float32 numpy array at SAMPLE_RATE Hz.
        audio_nparray = ffmpeg_read(inputs, SAMPLE_RATE)
        # The pipeline returns a dict, not a list: index the transcription by key.
        prediction = self.pipe(audio_nparray, return_timestamps=True)
        return {"text": prediction["text"]}
        # We can also return timestamps for the predictions:
        # prediction = self.pipe(audio_nparray, return_timestamps=True)["chunks"]
        # [{'text': ' Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel.',
        #   'timestamp': (0.0, 5.44)}]
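

# A minimal local smoke test, assuming the handler runs outside Inference
# Endpoints and that an ffmpeg-decodable audio file named "sample.flac"
# (a hypothetical filename) sits in the working directory. It mimics the
# payload shape the endpoint sends: raw audio bytes under the "inputs" key.
if __name__ == "__main__":
    handler = EndpointHandler()
    with open("sample.flac", "rb") as f:
        audio_bytes = f.read()
    result = handler({"inputs": audio_bytes})
    print(result["text"])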