AAhad committed on
Commit
e16e61e
·
1 Parent(s): 3f2970b

added webrtc

Browse files
Files changed (2) hide show
  1. app.py +105 -33
  2. requirements.txt +2 -1
app.py CHANGED
@@ -5,6 +5,10 @@ import numpy as np
5
  import time
6
  from transformers import pipeline
7
  from io import BytesIO
 
 
 
 
8
 
9
  # Define the models (You can replace these with any other top models supporting audio input)
10
  MODELS = {
@@ -25,47 +29,115 @@ language = st.selectbox("Choose Language", options=["English", "Thai"])
25
  # Model selection
26
  model_choice = st.selectbox("Choose a Model", options=list(MODELS.keys()))
27
 
28
- # Record audio
29
- st.subheader("Record your audio")
30
- audio_recorder = st.audio("")
31
 
32
- if st.button("Start Recording"):
33
- # Add code here to handle audio recording via mic or upload if needed
34
- st.warning("Audio recording functionality needs to be implemented")
35
 
36
- # Placeholder for conversion metrics
37
- if audio_recorder:
38
- st.write("Recording audio metrics...")
39
-
40
- # Read audio file
41
- audio_data, sr = librosa.load(audio_recorder, sr=None)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
 
43
- # Compute audio properties
44
- audio_size = len(audio_data) * 2 # in bytes (16-bit PCM)
45
- frame_rate = sr
46
- duration = librosa.get_duration(y=audio_data, sr=sr)
47
 
48
- # Display audio properties
49
- st.write(f"Audio Size: {audio_size} bytes")
50
- st.write(f"Frame Rate: {frame_rate} Hz")
51
- st.write(f"Duration: {duration:.2f} seconds")
52
 
53
- # Perform conversion using the selected model
54
- st.subheader("Converting audio to text...")
 
55
 
56
- start_time = time.time()
 
 
57
 
58
- # Load the model from HuggingFace
59
- model = pipeline("automatic-speech-recognition", model=MODELS[model_choice])
 
 
60
 
61
- # Perform the conversion
62
- audio_bytes = BytesIO(sf.write("temp.wav", audio_data, sr))
63
- result = model(audio_bytes)
64
 
65
- end_time = time.time()
66
 
67
- # Display results
68
- st.write("Transcription:", result['text'])
69
- st.write(f"Conversion took {end_time - start_time:.2f} seconds")
70
 
71
- # Provide placeholder for actual audio recording functionality if necessary.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
  import time
6
  from transformers import pipeline
7
  from io import BytesIO
8
+ import tempfile
9
+ from streamlit_webrtc import webrtc_streamer, WebRtcMode, ClientSettings
10
+ import av
11
+ import queue
12
 
13
  # Define the models (You can replace these with any other top models supporting audio input)
14
  MODELS = {
 
29
  # Model selection
30
  model_choice = st.selectbox("Choose a Model", options=list(MODELS.keys()))
31
 
32
+ # Audio input options
33
+ st.subheader("Record or Upload your audio")
34
+ audio_option = st.radio("Choose an option:", ('Record Audio', 'Upload Audio'))
35
 
36
+ audio_data = None
 
 
37
 
38
+ # Queue to store recorded audio frames
39
+ audio_queue = queue.Queue()
40
+
41
+ # WebRTC Audio Recorder
42
+ def audio_frame_callback(frame: av.AudioFrame):
43
+ audio = frame.to_ndarray()
44
+ audio_queue.put(audio)
45
+ return frame
46
+
47
+ # Option 1: Record audio via browser using WebRTC
48
+ if audio_option == 'Record Audio':
49
+ st.write("Click the button to start/stop recording.")
50
+
51
+ webrtc_ctx = webrtc_streamer(
52
+ key="audio-stream",
53
+ mode=WebRtcMode.SENDONLY,
54
+ client_settings=ClientSettings(
55
+ media_stream_constraints={
56
+ "audio": True,
57
+ "video": False,
58
+ }
59
+ ),
60
+ audio_frame_callback=audio_frame_callback,
61
+ )
62
 
63
+ if webrtc_ctx.state.playing:
64
+ st.write("Recording...")
 
 
65
 
66
+ # Convert recorded audio frames to a numpy array for processing
67
+ recorded_audio = []
68
+ while not audio_queue.empty():
69
+ recorded_audio.append(audio_queue.get())
70
 
71
+ if recorded_audio:
72
+ audio_data = np.concatenate(recorded_audio, axis=0)
73
+ sr = 16000 # Assuming a standard sample rate for WebRTC
74
 
75
+ # Compute audio properties
76
+ audio_size = len(audio_data) * 2 # in bytes (16-bit PCM)
77
+ duration = len(audio_data) / sr
78
 
79
+ # Display audio properties
80
+ st.write(f"Audio Size: {audio_size} bytes")
81
+ st.write(f"Frame Rate: {sr} Hz")
82
+ st.write(f"Duration: {duration:.2f} seconds")
83
 
84
+ # Perform conversion using the selected model
85
+ st.subheader("Converting audio to text...")
 
86
 
87
+ start_time = time.time()
88
 
89
+ # Load the model from HuggingFace
90
+ model = pipeline("automatic-speech-recognition", model=MODELS[model_choice])
 
91
 
92
+ # Perform the conversion
93
+ audio_bytes = BytesIO(sf.write("temp.wav", audio_data, sr))
94
+ result = model(audio_bytes)
95
+
96
+ end_time = time.time()
97
+
98
+ # Display results
99
+ st.write("Transcription:", result['text'])
100
+ st.write(f"Conversion took {end_time - start_time:.2f} seconds")
101
+
102
+ # Option 2: Upload audio
103
+ elif audio_option == 'Upload Audio':
104
+ audio_file = st.file_uploader("Upload audio file (WAV format)", type=['wav'])
105
+
106
+ if audio_file:
107
+ # Load the audio file
108
+ with tempfile.NamedTemporaryFile(delete=False) as tmp_file:
109
+ tmp_file.write(audio_file.read())
110
+ tmp_file_path = tmp_file.name
111
+
112
+ audio_data, sr = librosa.load(tmp_file_path, sr=None)
113
+
114
+ # Compute audio properties
115
+ audio_size = len(audio_data) * 2 # in bytes (16-bit PCM)
116
+ frame_rate = sr
117
+ duration = librosa.get_duration(y=audio_data, sr=sr)
118
+
119
+ # Display audio properties
120
+ st.write(f"Audio Size: {audio_size} bytes")
121
+ st.write(f"Frame Rate: {frame_rate} Hz")
122
+ st.write(f"Duration: {duration:.2f} seconds")
123
+
124
+ # Perform conversion using the selected model
125
+ st.subheader("Converting audio to text...")
126
+
127
+ start_time = time.time()
128
+
129
+ # Load the model from HuggingFace
130
+ model = pipeline("automatic-speech-recognition", model=MODELS[model_choice])
131
+
132
+ # Perform the conversion
133
+ audio_bytes = BytesIO(sf.write(tmp_file_path, audio_data, sr))
134
+ result = model(tmp_file_path)
135
+
136
+ end_time = time.time()
137
+
138
+ # Display results
139
+ st.write("Transcription:", result['text'])
140
+ st.write(f"Conversion took {end_time - start_time:.2f} seconds")
141
+
142
+ else:
143
+ st.write("Please select an audio input option.")
requirements.txt CHANGED
@@ -1,4 +1,5 @@
1
  streamlit
2
  transformers
3
  librosa
4
- soundfile
 
 
1
  streamlit
2
  transformers
3
  librosa
4
+ soundfile
5
+ streamlit_webrtc