"""Streamlit app that transcribes recorded or uploaded audio to text.

The user picks a HuggingFace speech-recognition model, then either records
audio in the browser (via streamlit-webrtc) or uploads a WAV file. The audio
is converted to a mono float waveform, resampled to the rate the model
expects, and passed to a ``transformers`` ASR pipeline.
"""

import queue
import tempfile  # retained from the original import list (no longer used below)
import time
from io import BytesIO

import av
import librosa
import numpy as np
import soundfile as sf  # retained from the original import list (no longer used below)
import streamlit as st
from streamlit_webrtc import RTCConfiguration, WebRtcMode, webrtc_streamer
from transformers import pipeline

# Define the models
MODELS = {
    "Whisper (English)": "openai/whisper-small.en",
    "Whisper (Multilingual)": "openai/whisper-small",
    "Facebook Wav2Vec2": "facebook/wav2vec2-large-960h",
    "Google Wav2Vec2": "google/wav2vec2-large-xlsr-53",
    "Whisper (Thai)": "openai/whisper-large",
}

# Used only when the loaded pipeline does not expose its expected input rate.
DEFAULT_MODEL_SAMPLE_RATE = 16000

# Audio longer than this is processed by the pipeline in windows.
CHUNK_LENGTH_SECONDS = 30

# NOTE(review): both server URLs were blank in the reviewed source, which is
# not a usable ICE configuration. These are Google's public STUN servers --
# confirm, or replace them with the servers this deployment should use.
ICE_SERVERS = [
    {"urls": ["stun:stun.l.google.com:19302"]},
    {"urls": ["stun:stun1.l.google.com:19302"]},
]


@st.cache_resource
def load_model(model_id: str):
    """Build the ASR pipeline for ``model_id`` once and reuse it on reruns.

    Streamlit re-executes the whole script on every widget interaction;
    without caching the model weights would be reloaded each time.
    """
    return pipeline("automatic-speech-recognition", model=model_id)


def _frame_to_mono(frame: av.AudioFrame) -> np.ndarray:
    """Convert one audio frame to a 1-D float32 mono waveform.

    ``to_ndarray`` yields one row per channel for planar formats and a single
    interleaved row for packed formats, so the two layouts are unpacked
    differently before the channels are averaged.
    """
    samples = frame.to_ndarray()
    channel_count = max(len(frame.layout.channels), 1)

    if frame.format.is_planar:
        per_channel = samples
    else:
        per_channel = samples.reshape(-1, channel_count).T

    if np.issubdtype(per_channel.dtype, np.integer):
        # Scale signed integer PCM into [-1.0, 1.0).
        # NOTE(review): unsigned 8-bit PCM would also need an offset; not
        # handled here -- confirm it cannot occur.
        full_scale = float(np.iinfo(per_channel.dtype).max) + 1.0
        per_channel = per_channel.astype(np.float32) / full_scale

    return per_channel.mean(axis=0).astype(np.float32)


def _drain_recording(frames_queue: "queue.Queue"):
    """Empty ``frames_queue`` and join its chunks into one waveform.

    Returns:
        ``(waveform, sample_rate)``, or ``(None, None)`` when the queue held
        no audio. The sample rate of the first chunk is used for the whole
        recording (assumes the rate does not change mid-stream).
    """
    chunks = []
    sample_rate = None
    while True:
        try:
            chunk, chunk_rate = frames_queue.get_nowait()
        except queue.Empty:
            break
        if sample_rate is None:
            sample_rate = chunk_rate
        chunks.append(chunk)

    if not chunks:
        return None, None
    return np.concatenate(chunks), sample_rate


def transcribe_and_report(audio_data: np.ndarray, sr: int, model_name: str) -> None:
    """Show the audio's properties, transcribe it and display the result.

    Args:
        audio_data: 1-D mono float waveform.
        sr: Sample rate of ``audio_data`` in Hz.
        model_name: Key into ``MODELS`` selecting the model to run.
    """
    # Compute audio properties
    audio_size = len(audio_data) * 2  # in bytes (16-bit PCM)
    duration = len(audio_data) / sr

    # Display audio properties
    st.write(f"Audio Size: {audio_size} bytes")
    st.write(f"Frame Rate: {sr} Hz")
    st.write(f"Duration: {duration:.2f} seconds")

    # Perform conversion using the selected model
    st.subheader("Converting audio to text...")
    started = time.perf_counter()

    # Top-level UI boundary: surface any model/inference failure to the user
    # instead of crashing the script run with a traceback.
    try:
        model = load_model(MODELS[model_name])

        # The pipeline treats a bare array as already being at the model's
        # rate, so resample here rather than relying on the caller.
        extractor = getattr(model, "feature_extractor", None)
        target_sr = getattr(extractor, "sampling_rate", DEFAULT_MODEL_SAMPLE_RATE)
        waveform = np.ascontiguousarray(audio_data, dtype=np.float32)
        if sr != target_sr:
            waveform = librosa.resample(waveform, orig_sr=sr, target_sr=target_sr)

        result = model(waveform, chunk_length_s=CHUNK_LENGTH_SECONDS)
    except Exception as exc:
        st.error(f"Transcription failed: {exc}")
        return

    elapsed = time.perf_counter() - started

    # Display results
    st.write("Transcription:", result['text'])
    st.write(f"Conversion took {elapsed:.2f} seconds")


# App UI
st.title("Audio to Text Conversion")
st.subheader("Select language and model")

# Language selection
# NOTE(review): the chosen language is not passed to the model anywhere in
# this script -- confirm whether it should influence transcription.
language = st.selectbox("Choose Language", options=["English", "Thai"])

# Model selection
model_choice = st.selectbox("Choose a Model", options=list(MODELS.keys()))

# Audio input options
st.subheader("Record or Upload your audio")
audio_option = st.radio("Choose an option:", ('Record Audio', 'Upload Audio'))

audio_data = None

# Queue to store recorded audio frames. It lives in session state because a
# plain module-level queue would be replaced on every script rerun, dropping
# whatever the callback had captured so far.
if "audio_queue" not in st.session_state:
    st.session_state["audio_queue"] = queue.Queue()
audio_queue = st.session_state["audio_queue"]


# WebRTC Audio Recorder
def audio_frame_callback(frame: av.AudioFrame):
    """Queue each incoming frame as ``(mono_waveform, sample_rate)``.

    The frame is returned unchanged so the stream itself is not altered.
    """
    audio_queue.put((_frame_to_mono(frame), frame.sample_rate))
    return frame


# Option 1: Record audio via browser using WebRTC
if audio_option == 'Record Audio':
    st.write("Click the button to start/stop recording.")

    rtc_configuration = RTCConfiguration({"iceServers": ICE_SERVERS})

    # Start WebRTC recording
    webrtc_ctx = webrtc_streamer(
        key="audio-stream",
        mode=WebRtcMode.SENDONLY,
        rtc_configuration=rtc_configuration,
        media_stream_constraints={"audio": True, "video": False},
        audio_frame_callback=audio_frame_callback,
    )

    if webrtc_ctx.state.playing:
        # Frames are still arriving; transcribing now would see only the
        # first few milliseconds, so wait until the stream is stopped.
        st.write("Recording...")
    else:
        audio_data, sr = _drain_recording(audio_queue)
        if audio_data is not None:
            transcribe_and_report(audio_data, sr, model_choice)

# Option 2: Upload audio
elif audio_option == 'Upload Audio':
    audio_file = st.file_uploader("Upload audio file (WAV format)", type=['wav'])

    if audio_file is not None:
        # Decode straight from memory: no temporary file to create or clean up.
        try:
            audio_data, sr = librosa.load(BytesIO(audio_file.getvalue()), sr=None)
        except Exception as exc:
            st.error(f"Could not read the uploaded audio: {exc}")
        else:
            transcribe_and_report(audio_data, sr, model_choice)

else:
    st.write("Please select an audio input option.")