vakyansh commited on
Commit
d556da4
·
verified ·
1 Parent(s): 3e0225d

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +84 -0
app.py ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import torch
3
+ import librosa
4
+ import soundfile
5
+ import nemo.collections.asr as nemo_asr
6
+ import tempfile
7
+ import os
8
+ import uuid
9
+ from pydub import AudioSegment
10
+ import numpy as np
11
+ import io
12
+
13
+ SAMPLE_RATE = 16000
14
+
15
# Load pre-trained model. Runs at import time, so app startup blocks until the
# checkpoint is downloaded/cached on first launch.
model = nemo_asr.models.EncDecRNNTBPEModel.from_pretrained("stt_en_conformer_transducer_large")
# Passing None resets decoding to the model's default configuration.
model.change_decoding_strategy(None)
# Inference mode: disables dropout / batch-norm statistic updates.
model.eval()
20
+
21
def process_audio_data(audio_data):
    """Convert a pydub AudioSegment to a mono numpy array at SAMPLE_RATE.

    Args:
        audio_data: a pydub ``AudioSegment`` (mono or stereo, any frame rate).

    Returns:
        ``numpy.ndarray`` of samples. Integer dtype when no resampling was
        needed (pydub's native sample array), float32 after resampling.
    """
    # Downmix stereo to mono so the model receives a single channel.
    if audio_data.channels == 2:
        audio_data = audio_data.set_channels(1)

    # pydub exposes raw PCM samples as an array of (signed) integers.
    audio_np = np.array(audio_data.get_array_of_samples())

    # Resample if necessary. librosa >= 0.10 requires keyword-only sample
    # rates, and librosa.resample only accepts floating-point input, so the
    # integer PCM array is cast to float32 on this path.
    if audio_data.frame_rate != SAMPLE_RATE:
        audio_np = librosa.resample(
            audio_np.astype(np.float32),
            orig_sr=audio_data.frame_rate,
            target_sr=SAMPLE_RATE,
        )

    return audio_np
34
+
35
+
36
def transcribe(audio_np):
    """Transcribe a mono numpy audio signal (at SAMPLE_RATE) to text.

    The samples are written to a throwaway WAV file because the NeMo model's
    ``transcribe`` API consumes file paths; the file is deleted with the
    temporary directory on exit.
    """
    with tempfile.TemporaryDirectory() as workdir:
        # Unique filename so concurrent sessions cannot collide.
        wav_path = os.path.join(workdir, f'audio_{uuid.uuid4()}.wav')
        soundfile.write(wav_path, audio_np, SAMPLE_RATE)

        hypotheses = model.transcribe([wav_path])

        # RNNT models return (best_hypotheses, all_hypotheses); keep the best.
        if isinstance(hypotheses, tuple) and len(hypotheses) == 2:
            hypotheses = hypotheses[0]

        return hypotheses[0]
50
+
51
+
52
+ st.title("Speech Recognition with NeMo Conformer Transducer Large - English")
53
+
54
+ # Record audio
55
+ st.write("Click the button below to start recording.")
56
+ record_state = st.checkbox("Recording")
57
+
58
+ if record_state:
59
+ # Start recording audio
60
+ recording = st.audio("", format="audio/wav")
61
+
62
+ # Stop recording when checkbox is unchecked
63
+ recording_file = tempfile.NamedTemporaryFile(delete=False)
64
+ with recording_file as f:
65
+ while record_state:
66
+ audio_data = st.audio_recorder(
67
+ sample_rate=SAMPLE_RATE,
68
+ format="wav",
69
+ data_format="audio/wav"
70
+ )
71
+ f.write(audio_data.getvalue())
72
+
73
+ # Update recording display
74
+ audio_data = AudioSegment.from_wav(io.BytesIO(audio_data.getvalue()))
75
+ recording.audio(audio_data, format="audio/wav")
76
+ record_state = st.checkbox("Recording")
77
+
78
+ # Process and transcribe recorded audio
79
+ recording_file.seek(0)
80
+ audio_np = process_audio_data(AudioSegment.from_file(recording_file.name))
81
+ transcript = transcribe(audio_np)
82
+
83
+ st.write("Transcription:")
84
+ st.write(transcript)