import os

import gradio as gr
import librosa
import torch
from transformers import WhisperForConditionalGeneration, WhisperProcessor

# Read the Hugging Face access token from the environment; it is required
# to download the model weights.
hf_token = os.getenv("hf_token")
if hf_token is None:
    raise ValueError(
        "Hugging Face token not found. Please set the 'hf_token' environment variable."
    )

# Load the Whisper processor (feature extractor + tokenizer) configured for
# Indonesian transcription, and the Javanese fine-tuned Whisper model.
processor = WhisperProcessor.from_pretrained(
    "openai/whisper-small",
    language="Indonesian",
    task="transcribe",
    token=hf_token,
)
model = WhisperForConditionalGeneration.from_pretrained(
    "avalonai/whisper-small-jv", token=hf_token
)


def transcribe(audio):
    # Gradio passes the path of the recorded audio file; resample it to the
    # 16 kHz rate that Whisper expects.
    audio, _ = librosa.load(audio, sr=16000)
    audio_input = processor(audio, return_tensors="pt", sampling_rate=16000)
    input_features = audio_input.input_features
    # Generate token ids without tracking gradients, then decode them to text.
    with torch.no_grad():
        generated_ids = model.generate(input_features)
    transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)
    return transcription[0]


iface = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(sources=["microphone"], type="filepath"),
    outputs="text",
    title="Speech-to-text on Javanese Language Demo",
    description=(
        "This is a platform for testing Avalon AI's speech-to-text model for the "
        "Javanese language. Please try it by speaking a sentence."
    ),
)
iface.launch()