# test-streamlit / app.py
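"""Streamlit demo: record English speech in the browser and transcribe it with
NVIDIA NeMo's Conformer-Transducer (large) ASR model."""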
import streamlit as st
import torch
import librosa
import soundfile
import nemo.collections.asr as nemo_asr
import tempfile
import os
import uuid
from pydub import AudioSegment
import numpy as np
import io
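# The English Conformer-Transducer checkpoint expects 16 kHz, mono audio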
SAMPLE_RATE = 16000
# Load the pre-trained model once and cache it across Streamlit reruns
@st.cache_resource
def load_model():
    m = nemo_asr.models.EncDecRNNTBPEModel.from_pretrained("stt_en_conformer_transducer_large")
    m.change_decoding_strategy(None)  # keep the checkpoint's default RNNT decoding config
    m.eval()
    return m
model = load_model()
def process_audio_data(audio_data):
    # Convert stereo to mono
    if audio_data.channels == 2:
        audio_data = audio_data.set_channels(1)
    # Convert the pydub segment to a float32 numpy array scaled to [-1, 1]
    audio_np = np.array(audio_data.get_array_of_samples()).astype(np.float32)
    audio_np /= audio_data.max_possible_amplitude
    # Resample to the model's sample rate if needed (librosa >= 0.10 requires keyword args)
    if audio_data.frame_rate != SAMPLE_RATE:
        audio_np = librosa.resample(audio_np, orig_sr=audio_data.frame_rate, target_sr=SAMPLE_RATE)
    return audio_np
def transcribe(audio_np):
    with tempfile.TemporaryDirectory() as tmpdir:
        # Save audio data to a temporary WAV file
        audio_path = os.path.join(tmpdir, f'audio_{uuid.uuid4()}.wav')
        soundfile.write(audio_path, audio_np, SAMPLE_RATE)
        # Transcribe audio
        transcriptions = model.transcribe([audio_path])
        # Extract best hypotheses if transcriptions form a tuple (from RNNT)
        if isinstance(transcriptions, tuple) and len(transcriptions) == 2:
            transcriptions = transcriptions[0]
        return transcriptions[0]
st.title("Speech Recognition with NeMo Conformer Transducer Large - English")
# Record audio in the browser (st.audio_input requires Streamlit >= 1.39)
st.write("Click the microphone below, record your speech, then stop to transcribe.")
recorded = st.audio_input("Record audio")
if recorded is not None:
    # Play back the recording
    st.audio(recorded, format="audio/wav")
    # Decode the WAV bytes, convert to 16 kHz mono float audio, and transcribe
    audio_segment = AudioSegment.from_file(io.BytesIO(recorded.getvalue()), format="wav")
    audio_np = process_audio_data(audio_segment)
    with st.spinner("Transcribing..."):
        transcript = transcribe(audio_np)
    st.write("Transcription:")
    st.write(transcript)
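# To run locally (assumes streamlit, nemo_toolkit[asr], librosa, soundfile, pydub,
# and ffmpeg for pydub are installed):
#   streamlit run app.py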