"""Custom endpoint handler that transcribes Danish speech with openai/whisper-large."""

from typing import Dict

import torch
from transformers import pipeline
from transformers.pipelines.audio_utils import ffmpeg_read

SAMPLE_RATE = 16000
MODEL_NAME = "openai/whisper-large"
LANG = "da"  # ISO 639-1 code for Danish ("dk" is a country code, not a language code)


class EndpointHandler:
    def __init__(self, path=""):
        # Run on the first GPU when one is available, otherwise on CPU.
        device = 0 if torch.cuda.is_available() else "cpu"
        self.pipe = pipeline(
            task="automatic-speech-recognition",
            model=MODEL_NAME,
            chunk_length_s=30,
            device=device,
        )
        # Force Danish transcription so Whisper skips language auto-detection.
        self.pipe.model.config.forced_decoder_ids = self.pipe.tokenizer.get_decoder_prompt_ids(
            language=LANG, task="transcribe"
        )

    def __call__(self, data: Dict[str, bytes]) -> Dict[str, str]:
        """
        Args:
            data (:obj:`dict`):
                includes the deserialized audio file as bytes under the "inputs" key
        Return:
            A :obj:`dict` with the transcribed text under the "tekst" key
        """
        inputs = data.pop("inputs", data)
        # Decode the raw bytes into a float32 waveform resampled to 16 kHz.
        audio_nparray = ffmpeg_read(inputs, SAMPLE_RATE)
        # Run chunked transcription; the pipeline accepts the NumPy waveform directly.
        result = self.pipe(audio_nparray)
        # "tekst" is Danish for "text"; the key is kept for API compatibility.
        return {"tekst": result["text"]}