from transformers import pipeline
import gradio as gr
import librosa
import torch

# Pick the best available device: CUDA GPU, Apple Silicon (MPS), or CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif (
    hasattr(torch.backends, "mps")
    and torch.backends.mps.is_available()
    and torch.backends.mps.is_built()
):
    device = torch.device("mps")
else:
    device = torch.device("cpu")

# Whisper ASR pipelines. The model must be given as the full Hugging Face
# repo id ("openai/whisper-base"), not the bare size name "base", which
# would fail to resolve to a checkpoint.
pipe1 = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-base",
    tokenizer="openai/whisper-base",
    chunk_length_s=26,
    device=device,
    stride_length_s=(4, 2),
)
pipe2 = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-tiny",
    tokenizer="openai/whisper-tiny",
    chunk_length_s=26,
    device=device,
    stride_length_s=(4, 2),
)


def transcribe(audio, x, model):
    # Prefer the microphone recording; fall back to the uploaded file.
    # Whisper expects 16 kHz mono audio, so resample on load.
    if audio is None:
        sample = librosa.load(x, sr=16_000, mono=True)[0]
    else:
        sample = librosa.load(audio, sr=16_000, mono=True)[0]
    if model == "base":
        transcription_whspr = pipe1(sample, batch_size=8)["text"]
    else:  # "tiny"
        transcription_whspr = pipe2(sample, batch_size=8)["text"]
    return transcription_whspr


# Gradio 3.x API: Audio components take `source=`; in Gradio 4.x this
# argument became `sources=[...]`.
iface = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(source="microphone", type="filepath"),
        gr.Audio(source="upload", type="filepath"),
        gr.Dropdown(
            choices=["base", "tiny"],
            info="model k wuzwolenju",  # Upper Sorbian: "model to choose"
            value="base",
        ),
    ],
    outputs="text",
    title="Serbski STT",
    # Upper Sorbian: "Gradio demo for speech recognition in Upper Sorbian"
    description="Gradio demo za spóznawanje rěće w hornjoserbšćinje",
)

iface.launch(debug=True)
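
# A minimal sketch of using the transcriber without the Gradio UI, assuming
# a local recording at "sample.wav" (hypothetical path; any format librosa
# can decode works). Pass None for the microphone input to force the
# uploaded-file branch:
#
#     print(transcribe(None, "sample.wav", "base"))
#
# The underlying pipeline also accepts a file path directly (this relies on
# ffmpeg being installed for decoding):
#
#     print(pipe1("sample.wav", batch_size=8)["text"])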