Spaces:
Runtime error
Runtime error
File size: 2,619 Bytes
3119dd6 da923e2 3119dd6 8d72534 3119dd6 1416675 3119dd6 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 |
import gradio as gr
import librosa
import soundfile as sf
import torch
import warnings
import os
from transformers import Wav2Vec2ProcessorWithLM, Wav2Vec2CTCTokenizer, Wav2Vec2Model
from engine import SpeechToTextEngine
import wave
from nemo_asr import transcribe
warnings.filterwarnings("ignore")
from speechbrain.pretrained import EncoderDecoderASR
# SpeechBrain encoder-decoder ASR model, built once at import time from the
# "speechbrain/asr-wav2vec2-commonvoice-rw" hparams and used by asr_transcript.
# NOTE(review): from_hparams presumably fetches the checkpoint into `savedir`
# on first run -- confirm against the SpeechBrain documentation.
asr_model = EncoderDecoderASR.from_hparams(source="speechbrain/asr-wav2vec2-commonvoice-rw", savedir="pretrained_models/asr-wav2vec2-commonvoice-rw")
# Example call kept for reference (not executed):
#asr_model.transcribe_file("speechbrain/asr-wav2vec2-commonvoice-rw/example.mp3")
# define speech-to-text function
def asr_transcript(audio, audio_microphone=None):
    """Transcribe one audio clip with the three speech-to-text back ends.

    Args:
        audio: Uploaded audio file object exposing a ``name`` attribute that
            holds its path on disk, or ``None`` when nothing was uploaded.
        audio_microphone: Microphone recording of the same kind. It is used
            only when ``audio`` is ``None``. The parameter exists because the
            interface declares two audio inputs and Gradio passes one
            positional argument per input component.

    Returns:
        A 3-tuple of strings, one per output textbox, in this order:
        SpeechBrain transcript (lower-cased), Coqui STT transcript and the
        NeMo conformer-transducer transcript. When no usable audio is given,
        the same explanatory message is returned in all three positions.

    Raises:
        ValueError: If the audio file exists but contains no bytes.
    """
    no_audio_message = "Please provide audio by uploading a file or by recording audio using microphone by pressing Record (And allow usage of microphone)"

    # Prefer the uploaded file; fall back to the microphone recording.
    if audio is None:
        audio = audio_microphone
    if audio is None:
        # One message per output component so the result matches the UI.
        return no_audio_message, no_audio_message, no_audio_message
    if not audio:
        return "File not valid", "File not valid", "File not valid"

    # Read and validate the raw bytes before running any (expensive) model.
    with open(audio.name, "rb") as audio_file:
        audio_proper = audio_file.read()
    if not audio_proper:
        raise ValueError("Audio not provided")

    text_asr = asr_model.transcribe_file(audio.name)
    # NOTE(review): the file object itself (not its path) is handed to
    # `transcribe` -- confirm this is what nemo_asr.transcribe expects.
    text_nemo_trasducer = transcribe(audio, "stt_rw_conformer_transducer_large")

    # Running the Coqui transcription on the raw bytes.
    stt_engine = SpeechToTextEngine()
    text_coqui = stt_engine.run(audio_proper)

    return text_asr.lower(), text_coqui, text_nemo_trasducer
# Web UI definition: two audio inputs (file upload and microphone) and three
# text outputs, one per speech-to-text model. Gradio calls `fn` with one
# argument per input component and expects one return value per output.
gradio_ui = gr.Interface(
    fn=asr_transcript,
    title="Kinyarwanda Speech Recognition",
    description="Record an audio clip from browser using microphone, and let AI do the hard work of transcribing.",
    article = """
This demo showcases two pretrained STT models the first model from speechbrain(wave2vec+CTC models)(1,2gb) is 30 times larger compared to the coqui STT (deepspeech model)(45mb).
""",
    inputs=[
        gr.inputs.Audio(label="Upload Audio File", type="file", optional=True),
        gr.inputs.Audio(source="microphone", type="file", optional=False, label="Record from microphone"),
    ],
    outputs=[
        gr.outputs.Textbox(label="Recognized speech from speechbrain model"),
        gr.outputs.Textbox(label="Recognized speech from coqui STT model"),
        gr.outputs.Textbox(label="Recognized speech from NVIDIA Conformer transduver large model"),
    ],
    # examples = [["sample_1.wav"],["sample_2.wav"]]
)
# Start the app with request queueing enabled.
gradio_ui.launch(enable_queue=True)