import base64
import io
import logging

from faster_whisper import WhisperModel

# NOTE(review): "flie_processor" looks like a typo of "file_processor", but the
# import must match the module filename on disk — confirm before renaming.
from flie_processor import process_video

logging.basicConfig(
    level=logging.DEBUG,
    format="%(asctime)s - %(levelname)s - %(message)s",
)


class EndpointHandler:
    """Inference endpoint that transcribes audio with faster-whisper.

    Accepts either base64-encoded audio bytes (``type="audio"``, the default)
    or a video link (``type="link"``) whose audio track and slides are
    extracted via ``process_video``.
    """

    def __init__(self, path: str = ""):
        """Load the Whisper model once at startup.

        Args:
            path: Unused; kept for handler-interface compatibility.
        """
        self.model = WhisperModel("large-v2", num_workers=30)

    def __call__(self, data: dict[str, str]) -> dict:
        """Run transcription on the payload and return segments (and slides).

        Args:
            data: Request payload. Mutated in place (keys are popped).
                Expected keys:
                - "inputs": base64-encoded audio (required unless type="link")
                - "link": video URL (required when type="link")
                - "language": language code, default "de"
                - "task": "transcribe" (default) or "translate"
                - "type": "audio" (default) or "link"

        Returns:
            dict with "audios" (list of segment dicts) and, for link
            processing, "slides" (list of slide dicts).

        Raises:
            ValueError: if the key required by the chosen processing type
                is missing.
        """
        # Pop with defaults: only the key required by the chosen processing
        # type is mandatory. (Previously a bare pop raised KeyError for
        # "inputs" even in link mode, and for "link" even in audio mode.)
        inputs = data.pop("inputs", None)
        link = data.pop("link", None)
        language = data.pop("language", "de")
        task = data.pop("task", "transcribe")
        processing_type = data.pop("type", "audio")

        response: dict = {}
        if processing_type == "link":
            if link is None:
                raise ValueError('type="link" requires a "link" field')
            slides, audio_bytes = process_video(link)
            slides_list = [slide.to_dict() for slide in slides]
            response.update({"slides": slides_list})
        else:
            if inputs is None:
                raise ValueError('audio processing requires an "inputs" field')
            audio_bytes_decoded = base64.b64decode(inputs)
            # Lazy %-style args avoid formatting cost when DEBUG is disabled.
            logging.debug("Decoded Bytes Length: %d", len(audio_bytes_decoded))
            audio_bytes = io.BytesIO(audio_bytes_decoded)

        # run inference pipeline
        logging.info("Running inference...")
        segments, info = self.model.transcribe(
            audio_bytes, language=language, task=task
        )

        full_text = []
        for segment in segments:
            full_text.append(
                {
                    "segmentId": segment.id,
                    "text": segment.text,
                    "timestamps": {
                        "start": segment.start,
                        "end": segment.end,
                    },
                }
            )
            # Periodic progress log; segments is a lazy generator, so this is
            # the only visibility into long transcriptions.
            if segment.id % 100 == 0:
                logging.info("segment %s transcribed", segment.id)
        logging.info("Inference completed.")

        response.update({"audios": full_text})
        logging.debug(response)
        return response