File size: 1,858 Bytes
91251fa
96d549d
91251fa
 
96d549d
 
 
 
 
91251fa
 
 
 
 
 
 
96d549d
 
 
91251fa
 
96d549d
 
 
91251fa
96d549d
 
 
 
 
 
 
 
91251fa
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
96d549d
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
import base64
import io
import logging

from faster_whisper import WhisperModel

from flie_processor import process_video

logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')


class EndpointHandler:
    """Inference endpoint that transcribes audio with a faster-whisper model,
    optionally extracting slide images from a video link first."""

    def __init__(self, path=""):
        # NOTE(review): `path` is accepted for endpoint-handler API
        # compatibility but is unused; the model name is hard-coded.
        self.model = WhisperModel("large-v2", num_workers=30)

    def __call__(self, data: dict[str, str]) -> dict:
        """Handle one inference request.

        Expected keys in ``data``:
            inputs: base64-encoded audio bytes (required unless type == "link").
            link: video URL/path (required when type == "link").
            language: language code for transcription (default "de").
            task: "transcribe" or "translate" (default "transcribe").
            type: "audio" (default) or "link".

        Returns:
            A dict with an "audios" list of segment dicts, plus a "slides"
            list when type == "link".

        Raises:
            ValueError: if the input required for the chosen type is missing.
        """
        # Pop with defaults: a request only carries the key relevant to its
        # processing type (the original popped both unconditionally and
        # raised KeyError on every normal single-type request).
        inputs = data.pop("inputs", None)
        link = data.pop("link", None)

        language = data.pop("language", "de")
        task = data.pop("task", "transcribe")
        processing_type = data.pop("type", "audio")

        response = {}

        if processing_type == "link":
            if link is None:
                raise ValueError('"link" is required when type == "link"')
            slides, audio_bytes = process_video(link)
            response["slides"] = [slide.to_dict() for slide in slides]
        else:
            if inputs is None:
                raise ValueError('"inputs" (base64-encoded audio) is required')
            audio_bytes_decoded = base64.b64decode(inputs)
            # Lazy %-args: the message is only formatted when DEBUG is enabled.
            logging.debug("Decoded Bytes Length: %d", len(audio_bytes_decoded))
            audio_bytes = io.BytesIO(audio_bytes_decoded)

        # run inference pipeline
        logging.info("Running inference...")
        segments, info = self.model.transcribe(audio_bytes, language=language, task=task)

        # NOTE(review): faster-whisper yields segments lazily — the loop below
        # is where transcription work actually happens.
        full_text = []
        for segment in segments:
            full_text.append({
                "segmentId": segment.id,
                "text": segment.text,
                "timestamps": {
                    "start": segment.start,
                    "end": segment.end,
                },
            })

            # Periodic progress log for long inputs.
            if segment.id % 100 == 0:
                logging.info("segment %s transcribed", segment.id)
        logging.info("Inference completed.")

        response["audios"] = full_text
        logging.debug(response)
        return response