"""Gradio demo: Kinyarwanda speech-to-text using a NeMo Conformer-CTC model.

Accepts an uploaded or microphone-recorded audio clip (mp3/wav/ogg),
normalizes it to 16-bit mono WAV, and returns the model transcription.
"""

import os

import gradio as gr
import pyaudioconvert as pac
from pydub import AudioSegment

import nemo
import nemo.collections.asr as nemo_asr

# Absolute path to the pre-trained checkpoint; adjust per deployment.
MODEL_PATH = "/home/yonas/stt/demo/model/Kinyarwanda_nemo_stt_conformer_model.nemo"

# Load the pre-trained model once at startup (restoring is expensive).
model = nemo_asr.models.EncDecCTCModelBPE.restore_from(restore_path=MODEL_PATH)

# Explicit runtime check instead of `assert`, which is stripped under -O.
if not isinstance(model, nemo.collections.asr.models.EncDecCTCModel):
    raise TypeError(
        f"Restored checkpoint is not an EncDecCTCModel: {type(model).__name__}"
    )


def convert(file_name):
    """Normalize the audio file at *file_name* in place to 16-bit mono WAV.

    Supported input formats: .mp3, .wav, .ogg — matched on the actual file
    extension, case-insensitively (the original suffix test matched any
    name merely ending in the letters "mp3"/"wav"/"ogg").

    Args:
        file_name: Path to the audio file; it is overwritten in place.

    Returns:
        True on success, False if the extension is unsupported.
    """
    ext = os.path.splitext(file_name)[1].lower()
    if ext == ".mp3":
        # NOTE(review): the file keeps its .mp3 name but now holds WAV data;
        # pyaudioconvert appears to accept that, but consider renaming to .wav.
        AudioSegment.from_mp3(file_name).export(file_name, format="wav")
    elif ext == ".ogg":
        AudioSegment.from_ogg(file_name).export(file_name, format="wav")
    elif ext != ".wav":
        return False
    # Downmix/resample to 16-bit mono, the layout the ASR model expects.
    pac.convert_wav_to_16bit_mono(file_name, file_name)
    return True


def transcribe(audio):
    """Transcribe one audio file path and return the recognized text.

    Args:
        audio: Filesystem path supplied by the Gradio Audio component,
            or a falsy value when no audio was provided.

    Returns:
        The transcription, or a human-readable error message.
    """
    if not audio:
        return "No audio provided"
    if not convert(audio):
        return "The format must be mp3, wav, or ogg"
    # transcribe() takes a batch of paths and returns a batch of results;
    # we send a single-item batch and unwrap it.
    result = model.transcribe([audio])
    return result[0]


gradio_ui = gr.Interface(
    fn=transcribe,
    title="Kinyarwanda Speech Recognition",
    description="Upload an audio clip or record from browser using microphone.",
    inputs=[
        gr.Audio(
            label="Upload Audio File or Record from microphone",
            sources=["upload", "microphone"],
            type="filepath",
            format="wav",
        ),
    ],
    outputs=gr.Text(label="Recognized speech"),
)

# Launch the Gradio app
gradio_ui.launch(share=True, debug=True)