File size: 2,553 Bytes
d556da4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
import streamlit as st
import torch
import librosa
import soundfile
import nemo.collections.asr as nemo_asr
import tempfile
import os
import uuid
from pydub import AudioSegment
import numpy as np
import io

SAMPLE_RATE = 16000  # NeMo Conformer models expect 16 kHz mono input


@st.cache_resource
def _load_model():
    """Load and cache the pre-trained NeMo Conformer Transducer model.

    Streamlit re-executes this script on every widget interaction;
    st.cache_resource keeps a single model instance across reruns instead
    of repeating the expensive download/initialization each time.
    """
    m = nemo_asr.models.EncDecRNNTBPEModel.from_pretrained(
        "stt_en_conformer_transducer_large"
    )
    m.change_decoding_strategy(None)  # use the model's default decoding config
    m.eval()  # inference mode (disables dropout, etc.)
    return m


model = _load_model()


def process_audio_data(audio_data):
    """Convert a pydub AudioSegment to mono float32 samples at SAMPLE_RATE.

    Parameters
    ----------
    audio_data : pydub.AudioSegment
        Recorded audio of any channel count and sample rate.

    Returns
    -------
    np.ndarray
        1-D float32 waveform normalized to [-1.0, 1.0], resampled to
        SAMPLE_RATE if the source rate differs.
    """
    # Down-mix to mono; `> 1` also covers audio with more than two channels.
    if audio_data.channels > 1:
        audio_data = audio_data.set_channels(1)

    # pydub yields integer PCM samples, but librosa.resample requires float
    # input — convert and normalize by the full-scale value for this
    # sample width (e.g. 32768 for 16-bit audio).
    audio_np = np.array(audio_data.get_array_of_samples(), dtype=np.float32)
    audio_np /= float(1 << (8 * audio_data.sample_width - 1))

    # Resample if necessary. librosa >= 0.10 removed the positional
    # sample-rate arguments; orig_sr/target_sr must be passed by keyword.
    if audio_data.frame_rate != SAMPLE_RATE:
        audio_np = librosa.resample(
            audio_np, orig_sr=audio_data.frame_rate, target_sr=SAMPLE_RATE
        )

    return audio_np


def transcribe(audio_np):
    """Run the ASR model on a waveform and return the best transcript.

    The samples are written to a throwaway WAV file because NeMo's
    transcribe() API consumes file paths rather than in-memory arrays;
    the TemporaryDirectory context cleans the file up automatically.
    """
    with tempfile.TemporaryDirectory() as workdir:
        wav_path = os.path.join(workdir, f'audio_{uuid.uuid4()}.wav')
        soundfile.write(wav_path, audio_np, SAMPLE_RATE)
        results = model.transcribe([wav_path])

    # RNNT models may return (best_hypotheses, all_hypotheses); keep the best.
    if isinstance(results, tuple) and len(results) == 2:
        results = results[0]

    return results[0]


st.title("Speech Recognition with NeMo Conformer Transducer Large - English")

# Record audio with Streamlit's built-in recorder widget (st.audio_input,
# available in Streamlit >= 1.39 — TODO confirm the deployed version).
# The previous code called st.audio_recorder, which is not part of the
# Streamlit API, and polled a checkbox inside a `while` loop — that cannot
# work under Streamlit's execution model, where the whole script is re-run
# top-to-bottom on every interaction and any blocking loop freezes the app.
st.write("Use the recorder below; the transcription appears when you stop.")
recorded = st.audio_input("Record your speech")

if recorded is not None:
    # Echo the recording back so the user can review what was captured.
    st.audio(recorded, format="audio/wav")

    # Decode the recorded WAV bytes, convert to 16 kHz mono float samples,
    # and run them through the ASR model.
    segment = AudioSegment.from_file(io.BytesIO(recorded.getvalue()), format="wav")
    audio_np = process_audio_data(segment)
    transcript = transcribe(audio_np)

    st.write("Transcription:")
    st.write(transcript)