# Kinyarwanda-asr / app.py
# Last commit by rutsam: "use upload audio instead" (a39f293), 2.51 kB
import gradio as gr
import librosa
import soundfile as sf
import torch
import warnings
import os
from transformers import Wav2Vec2ProcessorWithLM, Wav2Vec2CTCTokenizer, Wav2Vec2Model
from engine import SpeechToTextEngine
import wave
from nemo_asr import transcribe
warnings.filterwarnings("ignore")
from speechbrain.pretrained import EncoderDecoderASR
# Load the SpeechBrain Kinyarwanda model once at import time; `savedir` is
# the local directory handed to `from_hparams` for the pretrained files.
# Example: asr_model.transcribe_file("speechbrain/asr-wav2vec2-commonvoice-rw/example.mp3")
asr_model = EncoderDecoderASR.from_hparams(
    source="speechbrain/asr-wav2vec2-commonvoice-rw",
    savedir="pretrained_models/asr-wav2vec2-commonvoice-rw",
)
# define speech-to-text function
def asr_transcript(audio):
    """Transcribe an uploaded audio file with three speech-to-text models.

    Args:
        audio: The uploaded file object supplied by the audio input; its
            ``name`` attribute is used as the path of the file on disk.
            ``None`` when nothing was provided.

    Returns:
        A 3-tuple of strings in the order (speechbrain transcript in lower
        case, coqui STT transcript, conformer CTC transcript). When no
        usable audio is supplied, the same explanatory message is returned
        in all three positions so the number of returned values always
        matches the three output textboxes.

    Raises:
        ValueError: If the uploaded file contains no data.
    """
    if audio is None:
        message = "Please provide audio by uploading a file or by recording audio using microphone by pressing Record (And allow usage of microphone)"
        return message, message, message
    if not audio:
        # Falsy but not None: report the problem in every output slot.
        return "File not valid", "File not valid", "File not valid"

    # Read and validate the raw bytes first so that an empty upload fails
    # fast, before any of the (expensive) models are run.
    with open(audio.name, "rb") as f:
        audio_proper = f.read()
    if not audio_proper:
        raise ValueError("Audio not provided")

    text_asr = asr_model.transcribe_file(audio.name)
    # NOTE(review): `transcribe` receives the file object itself rather than
    # its path, unlike the other two engines -- confirm against nemo_asr.
    text_nemo_transducer = transcribe(audio, "stt_rw_conformer_ctc_large")

    # NOTE(review): a new engine is built on every call; if construction is
    # costly it may be worth creating it once -- confirm it is reusable.
    stt_engine = SpeechToTextEngine()
    text_coqui = stt_engine.run(audio_proper)

    return text_asr.lower(), text_coqui, text_nemo_transducer
# Web UI: one uploaded audio file in, three transcripts out (one textbox per
# model, in the same order as the values returned by `asr_transcript`).
gradio_ui = gr.Interface(
    fn=asr_transcript,
    title="Kinyarwanda Speech Recognition",
    # The only input is a file upload, so the description asks for an upload
    # rather than a microphone recording.
    description="Upload an audio file, and let AI do the hard work of transcribing.",
    article="""
This demo showcases three pretrained STT models: the first model, from speechbrain (wav2vec2 + CTC, 1.2 GB), is 30 times larger than the second, coqui STT (DeepSpeech model, 45 MB); the third is the NVIDIA conformer CTC large model.
""",
    inputs=[gr.inputs.Audio(label="Upload Audio File", type="file", optional=False)],
    outputs=[
        gr.outputs.Textbox(label="Recognized speech from speechbrain model"),
        gr.outputs.Textbox(label="Recognized speech from coqui STT model"),
        gr.outputs.Textbox(label="Recognized speech from NVIDIA conformer ctc large model"),
    ],
    # examples = [["sample_1.wav"],["sample_2.wav"]]
)
gradio_ui.launch(enable_queue=True)