Spaces:

AAhad
/

thai-audio-to-text

Sleeping

App Files Files Community

AAhad committed on Sep 25, 2024

Commit

e16e61e

1 Parent(s): 3f2970b

added webrtc

Browse files

Files changed (2) hide show

app.py +105 -33
requirements.txt +2 -1

app.py CHANGED Viewed

@@ -5,6 +5,10 @@ import numpy as np
 import time
 from transformers import pipeline
 from io import BytesIO
 # Define the models (You can replace these with any other top models supporting audio input)
 MODELS = {
@@ -25,47 +29,115 @@ language = st.selectbox("Choose Language", options=["English", "Thai"])
 # Model selection
 model_choice = st.selectbox("Choose a Model", options=list(MODELS.keys()))
-# Record audio
-st.subheader("Record your audio")
-audio_recorder = st.audio("")
-if st.button("Start Recording"):
-    # Add code here to handle audio recording via mic or upload if needed
-    st.warning("Audio recording functionality needs to be implemented")
-# Placeholder for conversion metrics
-if audio_recorder:
-    st.write("Recording audio metrics...")
-    # Read audio file
-    audio_data, sr = librosa.load(audio_recorder, sr=None)
-    # Compute audio properties
-    audio_size = len(audio_data) * 2  # in bytes (16-bit PCM)
-    frame_rate = sr
-    duration = librosa.get_duration(y=audio_data, sr=sr)
-    # Display audio properties
-    st.write(f"Audio Size: {audio_size} bytes")
-    st.write(f"Frame Rate: {frame_rate} Hz")
-    st.write(f"Duration: {duration:.2f} seconds")
-    # Perform conversion using the selected model
-    st.subheader("Converting audio to text...")
-    start_time = time.time()
-    # Load the model from HuggingFace
-    model = pipeline("automatic-speech-recognition", model=MODELS[model_choice])
-    # Perform the conversion
-    audio_bytes = BytesIO(sf.write("temp.wav", audio_data, sr))
-    result = model(audio_bytes)
-    end_time = time.time()
-    # Display results
-    st.write("Transcription:", result['text'])
-    st.write(f"Conversion took {end_time - start_time:.2f} seconds")
-# Provide placeholder for actual audio recording functionality if necessary.

 import time
 from transformers import pipeline
 from io import BytesIO
+import tempfile
+from streamlit_webrtc import webrtc_streamer, WebRtcMode, ClientSettings
+import av
+import queue
 # Define the models (You can replace these with any other top models supporting audio input)
 MODELS = {
 # Model selection
 model_choice = st.selectbox("Choose a Model", options=list(MODELS.keys()))
+# Shared state for the two input paths below
+# Stays None until one of the input paths assigns the loaded samples to it
+audio_data = None
+# Buffer that the WebRTC frame callback fills with recorded audio frames
+audio_queue = queue.Queue()
+# Input-mode picker: microphone recording or file upload
+st.subheader("Record or Upload your audio")
+audio_option = st.radio(label="Choose an option:", options=('Record Audio', 'Upload Audio'))
+# WebRTC Audio Recorder
+# The code that drains audio_queue hard-codes sr = 16000 and 2 bytes per sample,
+# so every frame is converted to 16 kHz mono signed 16-bit PCM before queueing.
+# NOTE(review): browsers commonly deliver 48 kHz stereo over WebRTC - confirm on target clients.
+_frame_resampler = av.AudioResampler(format="s16", layout="mono", rate=16000)
+def audio_frame_callback(frame: av.AudioFrame):
+    """Queue the samples of one incoming audio frame and pass the frame through.
+    The frame is resampled to 16 kHz mono s16; each resulting frame is pushed onto
+    ``audio_queue`` as a 1-D array so consumers can join chunks end to end.
+    Returns the frame that was passed in.
+    """
+    # resample() may buffer internally and yield zero, one or several frames per call.
+    for converted in _frame_resampler.resample(frame):
+        audio_queue.put(converted.to_ndarray().reshape(-1))
+    return frame
+# Helpers shared by the recording and upload paths
+def _chunks_to_float_mono(chunks):
+    """Join queued frame arrays into one 1-D float32 signal.
+    Every chunk is flattened first so the result is 1-D whatever shape each frame
+    array has. Integer PCM is scaled by its dtype's full-scale value into [-1, 1);
+    floating-point input is only cast to float32.
+    NOTE(review): multi-channel chunks end up treated as one long mono signal here;
+    confirm the queue only ever holds mono data.
+    """
+    flat = np.concatenate([np.asarray(chunk).reshape(-1) for chunk in chunks])
+    if np.issubdtype(flat.dtype, np.integer):
+        return flat.astype(np.float32) / (float(np.iinfo(flat.dtype).max) + 1.0)
+    return flat.astype(np.float32)
+def _report_and_transcribe(samples, sample_rate, size_bytes):
+    """Display audio metrics, run the selected ASR model and show the transcription.
+    samples: 1-D float array holding the audio signal.
+    sample_rate: sampling rate of ``samples`` in Hz.
+    size_bytes: audio size to display, in bytes.
+    """
+    duration = len(samples) / sample_rate if sample_rate else 0.0
+    # Display audio properties
+    st.write(f"Audio Size: {size_bytes} bytes")
+    st.write(f"Frame Rate: {sample_rate} Hz")
+    st.write(f"Duration: {duration:.2f} seconds")
+    # Perform conversion using the selected model
+    st.subheader("Converting audio to text...")
+    start_time = time.time()
+    # Load the model from HuggingFace (the timing deliberately includes model loading, as before)
+    model = pipeline("automatic-speech-recognition", model=MODELS[model_choice])
+    # A bare array is taken by the pipeline as audio already at the model's rate,
+    # so resample here instead of relying on the pipeline to do it.
+    target_rate = model.feature_extractor.sampling_rate
+    if sample_rate != target_rate:
+        samples = librosa.resample(samples, orig_sr=sample_rate, target_sr=target_rate)
+    # sf.write() returns None, so the old BytesIO(sf.write(...)) was an empty buffer;
+    # hand the samples to the pipeline directly instead.
+    result = model(samples)
+    end_time = time.time()
+    # Display results
+    st.write("Transcription:", result['text'])
+    st.write(f"Conversion took {end_time - start_time:.2f} seconds")
+# Option 1: Record audio via browser using WebRTC
+if audio_option == 'Record Audio':
+    st.write("Click the button to start/stop recording.")
+    # media_stream_constraints is passed directly; wrapping it in ClientSettings is deprecated.
+    webrtc_ctx = webrtc_streamer(
+        key="audio-stream",
+        mode=WebRtcMode.SENDONLY,
+        media_stream_constraints={
+            "audio": True,
+            "video": False,
+        },
+        audio_frame_callback=audio_frame_callback,
+    )
+    if webrtc_ctx.state.playing:
+        st.write("Recording...")
+        # NOTE(review): the queue is drained once per script run, so only frames that are
+        # already queued at this point get transcribed - confirm the intended capture window.
+        recorded_audio = []
+        while not audio_queue.empty():
+            recorded_audio.append(audio_queue.get())
+        if recorded_audio:
+            sr = 16000  # Assumed sample rate of the queued audio - TODO confirm against the frame callback
+            # Size of the raw captured samples, measured before conversion to float
+            audio_size = sum(np.asarray(chunk).nbytes for chunk in recorded_audio)
+            # Chunks are joined end to end; concatenating 2-D frame arrays along axis 0
+            # stacked them instead and broke the size and duration figures.
+            audio_data = _chunks_to_float_mono(recorded_audio)
+            _report_and_transcribe(audio_data, sr, audio_size)
+# Option 2: Upload audio
+elif audio_option == 'Upload Audio':
+    audio_file = st.file_uploader("Upload audio file (WAV format)", type=['wav'])
+    if audio_file is not None:
+        # Decode straight from the uploaded file object: no temporary file is left on
+        # disk, and nothing is written back to a path that has no audio extension.
+        audio_data, sr = librosa.load(audio_file, sr=None)
+        # Size estimate assumes 16-bit PCM (2 bytes per sample), as before
+        _report_and_transcribe(audio_data, sr, len(audio_data) * 2)
+else:
+    st.write("Please select an audio input option.")

requirements.txt CHANGED Viewed

@@ -1,4 +1,5 @@
 streamlit
 transformers
 librosa
-soundfile

 streamlit
 transformers
 librosa
+soundfile
+streamlit_webrtc