import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline


class Whisper:
    """Wrapper around a Hugging Face Whisper checkpoint for audio transcription."""

    # Prefer the GPU with half precision when available; otherwise fall back to CPU/float32.
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

    def __init__(self, model_id: str = "openai/whisper-base") -> None:
        """
        Loads the Whisper model and processor for the given model id.

        Args:
            model_id (str): Hugging Face Hub identifier of the Whisper checkpoint.
        """
        self.model_id = model_id
        self.model = AutoModelForSpeechSeq2Seq.from_pretrained(
            model_id,
            torch_dtype=self.torch_dtype,
            low_cpu_mem_usage=True,
            use_safetensors=True,
        )
        self.model.to(self.device)
        self.processor = AutoProcessor.from_pretrained(model_id)

    @property
    def model_name(self):
        """
        Getter method for retrieving the model name.
        """
        return self.model_id

    def save(self, save_dir: str):
        """
        Saves the model and processor to the specified directory.

        Args:
            save_dir (str): The directory where the model and processor will be saved.
        """
        self.model.save_pretrained(f"{save_dir}/model")
        self.processor.save_pretrained(f"{save_dir}/processor")

    def load(self, load_dir: str):
        """
        Load the model and processor from the specified directory.

        Args:
            load_dir (str): The directory from which to load the model and processor.
        """
        self.model = AutoModelForSpeechSeq2Seq.from_pretrained(f"{load_dir}/model")
        self.processor = AutoProcessor.from_pretrained(f"{load_dir}/processor")

        self.model.to(self.device)

    def pipeline(self):
        """
        Builds an automatic-speech-recognition pipeline around the loaded model
        and processor, transcribing long audio in 15-second chunks with timestamps.
        """
        pipe = pipeline(
            "automatic-speech-recognition",
            model=self.model,
            tokenizer=self.processor.tokenizer,
            feature_extractor=self.processor.feature_extractor,
            max_new_tokens=128,
            chunk_length_s=15,
            batch_size=16,
            return_timestamps=True,
            torch_dtype=self.torch_dtype,
            device=self.device,
        )

        return pipe


def transcribe_audio(file):
    """
    Transcribes an audio file with a freshly instantiated Whisper pipeline.

    Args:
        file: Path to (or raw bytes of) an audio file accepted by the pipeline.
    """
    whisper = Whisper()
    pipe = whisper.pipeline()

    result = pipe(file)
    return result
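

if __name__ == "__main__":
    # Minimal usage sketch, not part of the original module: "sample.wav" is a
    # hypothetical placeholder path; any audio format ffmpeg can decode should work.
    output = transcribe_audio("sample.wav")
    print(output["text"])    # full transcript
    print(output["chunks"])  # timestamped segments, since return_timestamps=True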