# ASR_Demo_Kinyarwanda / gradio_demo.py
# Author: yonas — uploaded via huggingface_hub (commit a009c84, 1.51 kB)
import gradio as gr
import pyaudioconvert as pac
from pydub import AudioSegment
import nemo
import nemo.collections.asr as nemo_asr
# Load the pre-trained Kinyarwanda Conformer-CTC model from a local .nemo checkpoint.
# NOTE(review): absolute path is machine-specific — consider reading it from an
# environment variable or CLI argument so the demo runs outside this host.
MODEL_PATH = "/home/yonas/stt/demo/model/Kinyarwanda_nemo_stt_conformer_model.nemo"
model = nemo_asr.models.EncDecCTCModelBPE.restore_from(restore_path=MODEL_PATH)
# Explicit check instead of `assert`: asserts are stripped under `python -O`,
# which would let a wrong model type slip through to the first transcribe() call.
if not isinstance(model, nemo.collections.asr.models.EncDecCTCModel):
    raise TypeError(
        f"Expected an EncDecCTCModel, got {type(model).__name__} from {MODEL_PATH}"
    )
def convert(file_name):
    """Normalize an audio file to 16-bit mono WAV, overwriting it in place.

    MP3 and OGG inputs are first re-encoded to WAV via pydub (the file keeps
    its original name/extension but now contains WAV data), then every file is
    forced to 16-bit mono with pyaudioconvert, as expected by the ASR model.

    Args:
        file_name: Path to the uploaded/recorded audio file.

    Returns:
        True if the file had a supported extension and was converted,
        False for any other extension (file left untouched).
    """
    # Case-insensitive, dot-anchored check: the original bare endswith("mp3")
    # accepted names like "foomp3" and rejected upper-case ".MP3".
    lower = file_name.lower()
    if not lower.endswith((".mp3", ".wav", ".ogg")):
        return False
    if lower.endswith(".mp3"):
        AudioSegment.from_mp3(file_name).export(file_name, format="wav")
    elif lower.endswith(".ogg"):
        AudioSegment.from_ogg(file_name).export(file_name, format="wav")
    # .wav input needs no pydub step; all paths get the 16-bit mono pass.
    pac.convert_wav_to_16bit_mono(file_name, file_name)
    return True
def transcribe(audio):
    """Run Kinyarwanda speech recognition on one audio file.

    Args:
        audio: Filepath supplied by the Gradio Audio component, or a falsy
            value when the user submitted nothing.

    Returns:
        The recognized text, or a human-readable error message when the
        input is missing or not in a supported format.
    """
    if not audio:
        return "No audio provided"
    converted_ok = convert(audio)
    if not converted_ok:
        return "The format must be mp3, wav, or ogg"
    # NeMo's transcribe() takes a batch of paths and returns a list of texts;
    # we submit a single-item batch and unwrap the single result.
    transcriptions = model.transcribe([audio])
    return transcriptions[0]
# Build the web UI: a single Audio input (file upload or browser microphone,
# delivered to `transcribe` as a WAV filepath) and a text output.
gradio_ui = gr.Interface(
fn=transcribe,
title="Kinyarwanda Speech Recognition",
description="Upload an audio clip or record from browser using microphone.",
inputs=[
gr.Audio(label="Upload Audio File or Record from microphone", sources=["upload", "microphone"], type="filepath", format="wav"),
],
outputs=gr.Text(label="Recognized speech")
)
# Launch the Gradio app.
# share=True creates a public gradio.live tunnel; debug=True blocks and
# streams server logs/errors to the console.
gradio_ui.launch(share=True, debug=True)