import base64
import io
import logging

from faster_whisper import WhisperModel

from file_processor import process_video

logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')


class EndpointHandler:
    def __init__(self, path=""):
        # Load the Whisper model once at startup so it is reused across requests;
        # num_workers allows several transcriptions to run concurrently.
        self.model = WhisperModel("large-v2", num_workers=30)

    def __call__(self, data: dict[str, str]):
        # "inputs" carries base64-encoded audio and "link" a video URL; only one of
        # them is present per request, so default the other to None instead of
        # raising a KeyError.
        inputs = data.pop("inputs", None)
        link = data.pop("link", None)

        language = data.pop("language", "de")
        task = data.pop("task", "transcribe")
        processing_type = data.pop("type", "audio")

        response = {}

        if processing_type == "link":
            # Fetch the linked video and extract its slides and audio track.
            slides, audio_bytes = process_video(link)
            slides_list = [slide.to_dict() for slide in slides]
            response.update({"slides": slides_list})
        else:
            # Decode the base64-encoded audio payload into an in-memory buffer.
            audio_bytes_decoded = base64.b64decode(inputs)
            logging.debug(f"Decoded Bytes Length: {len(audio_bytes_decoded)}")
            audio_bytes = io.BytesIO(audio_bytes_decoded)

        logging.info("Running inference...")
        # faster-whisper returns a lazy generator; decoding happens while iterating below.
        segments, info = self.model.transcribe(audio_bytes, language=language, task=task)

        full_text = []
        for segment in segments:
            full_text.append({
                "segmentId": segment.id,
                "text": segment.text,
                "timestamps": {
                    "start": segment.start,
                    "end": segment.end,
                },
            })

            # Log progress every 100 segments.
            if segment.id % 100 == 0:
                logging.info(f"segment {segment.id} transcribed")
        logging.info("Inference completed.")

        response.update({"audios": full_text})
        logging.debug(response)
        return response
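

# Minimal local sketch of how the handler might be exercised, mirroring the payload
# the endpoint expects; the path "sample.wav" is a hypothetical placeholder and this
# block is not part of the original handler.
if __name__ == "__main__":
    with open("sample.wav", "rb") as f:
        payload = base64.b64encode(f.read()).decode("utf-8")

    handler = EndpointHandler()
    result = handler({
        "inputs": payload,
        "language": "de",
        "task": "transcribe",
        "type": "audio",
    })
    # Print the first few transcribed segments.
    print(result["audios"][:3])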