File size: 839 Bytes
f4c725a
 
c8672f7
7f74ea0
f4c725a
 
 
 
 
 
 
 
c8672f7
f4c725a
 
 
7f74ea0
 
 
 
 
 
 
 
 
f4c725a
 
 
 
 
 
5ece751
f4c725a
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
from transformers import pipeline
from accelerate import Accelerator
import spaces
import librosa

# Model identifier handed to `pipeline(..., model=model_id)` in `load_model`.
model_id = "JacobLinCool/whisper-large-v3-turbo-common_voice_19_0-zh-TW"

# Lazily created ASR pipeline: stays None until `load_model()` assigns it.
pipe = None


def load_model():
    """Build the speech-recognition pipeline and store it in the global `pipe`.

    The pipeline is created for `model_id` on the device reported by a fresh
    `Accelerator()` instance. Nothing is returned; the result is published
    through the module-level `pipe` variable.
    """
    global pipe
    pipe = pipeline(
        "automatic-speech-recognition",
        model=model_id,
        device=Accelerator().device,
    )


def get_gpu_duration(audio: str) -> int:
    """Derive a GPU duration value from the length of an audio file.

    The file at `audio` is decoded with `librosa.load`, its duration is
    divided by 60, and that figure is then rounded up to a multiple of 60
    via `(x + 59) // 60`. Both intermediate values are printed.

    Args:
        audio: Path of the audio file to measure.

    Returns:
        The computed value as an int, never less than 1.
    """
    samples, sample_rate = librosa.load(audio)

    # NOTE(review): the length is divided by 60 here and floor-divided by 60
    # again below. If `librosa.get_duration` reports seconds, the result is
    # roughly "hours, rounded up" -- confirm this matches the unit that
    # `spaces.GPU(duration=...)` expects.
    duration = librosa.get_duration(y=samples, sr=sample_rate) / 60.0
    gpu_duration = (duration + 59.0) // 60.0
    print(f"{duration=}, {gpu_duration=}")

    requested = int(gpu_duration)
    # Clamp to a floor of 1 so the caller never receives zero.
    return requested if requested > 1 else 1


@spaces.GPU(duration=get_gpu_duration)
def transcribe_audio_local(audio: str) -> str:
    """Transcribe an audio file with the module-level ASR pipeline.

    The function is wrapped by `spaces.GPU`, with `get_gpu_duration` passed
    as its `duration` argument. The pipeline is created on first use through
    `load_model()`. The input path and the raw pipeline output are printed.

    Args:
        audio: Path of the audio file to transcribe.

    Returns:
        The `"text"` entry of the pipeline output.
    """
    print(f"{audio=}")

    # Lazy initialisation: only build the pipeline if no call has done so yet.
    if pipe is None:
        load_model()

    # Timestamps are requested from the pipeline, but only the text is returned.
    out = pipe(audio, return_timestamps=True)
    print(f"{out=}")

    transcript = out["text"]
    return transcript