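"""Serbski STT: a Gradio demo for Upper Sorbian speech recognition.

Transcribes microphone or uploaded audio with OpenAI Whisper ("base" or
"tiny") through the Hugging Face transformers ASR pipeline. Written against
the Gradio 3.x API (gr.Audio(source=...)).
"""
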
from transformers import pipeline
import gradio as gr
import librosa
import torch


# Prefer CUDA, then Apple Silicon (MPS), then fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif (
    hasattr(torch.backends, "mps")
    and torch.backends.mps.is_available()
    and torch.backends.mps.is_built()
):
    device = torch.device("mps")
else:
    device = torch.device("cpu")


# Two Whisper checkpoints, selectable from the UI. Long recordings are
# transcribed in 26 s chunks with a (4 s, 2 s) overlapping stride so that
# words at chunk boundaries are not cut off. The full model IDs also
# resolve the matching tokenizers, so no separate tokenizer= is needed.
pipe1 = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-base",
    chunk_length_s=26,
    device=device,
    stride_length_s=(4, 2),
)
pipe2 = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-tiny",
    chunk_length_s=26,
    device=device,
    stride_length_s=(4, 2),
)


def transcribe(mic, upload, model):
    """Transcribe microphone or uploaded audio with the chosen Whisper model."""
    # Prefer the microphone recording; fall back to the uploaded file.
    path = mic if mic is not None else upload
    if path is None:
        raise gr.Error("Please record or upload an audio file.")

    # Whisper expects 16 kHz mono audio.
    sample = librosa.load(path, sr=16_000, mono=True)[0]

    pipe = pipe1 if model == "base" else pipe2
    return pipe(sample, batch_size=8)["text"]


iface = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(source="microphone", type="filepath", label="Microphone"),
        gr.Audio(source="upload", type="filepath", label="Audio file"),
        gr.Dropdown(
            choices=["base", "tiny"],
            info="model to use",
            value="base",
            label="Whisper model",
        ),
    ],
    outputs="text",
    title="Serbski STT",
    description="Gradio demo for speech recognition in Upper Sorbian",
)

if __name__ == "__main__":
    # debug=True prints tracebacks to the console while developing.
    iface.launch(debug=True)