import queue
import tempfile
import time

import av
import librosa
import numpy as np
import soundfile as sf
import streamlit as st
from streamlit_webrtc import RTCConfiguration, WebRtcMode, webrtc_streamer
from transformers import pipeline

# Define the models.
# Note: the XLSR-53 checkpoint is published under the "facebook" organization
# on the Hugging Face Hub (there is no "google/wav2vec2-large-xlsr-53"),
# so that entry is corrected below.
MODELS = {
    "Whisper (English)": "openai/whisper-small.en",
    "Whisper (Multilingual)": "openai/whisper-small",
    "Facebook Wav2Vec2": "facebook/wav2vec2-large-960h",
    "Facebook Wav2Vec2 XLSR-53": "facebook/wav2vec2-large-xlsr-53",
    "Whisper (Thai)": "openai/whisper-large",
}

TARGET_SR = 16000  # Whisper and Wav2Vec2 checkpoints expect 16 kHz input


@st.cache_resource
def load_model(model_name: str):
    """Load and cache the ASR pipeline so it is not rebuilt on every rerun."""
    return pipeline("automatic-speech-recognition", model=model_name)


# App UI
st.title("Audio to Text Conversion")
st.subheader("Select language and model")

# Language selection (informational only; Whisper detects the language itself)
language = st.selectbox("Choose Language", options=["English", "Thai"])

# Model selection
model_choice = st.selectbox("Choose a Model", options=list(MODELS.keys()))

# Audio input options
st.subheader("Record or Upload your audio")
audio_option = st.radio("Choose an option:", ("Record Audio", "Upload Audio"))

audio_data = None

# Queue to store recorded audio frames
audio_queue = queue.Queue()


def audio_frame_callback(frame: av.AudioFrame):
    """Push each incoming frame, its sample rate, and channel count onto the queue."""
    audio_queue.put((frame.to_ndarray(), frame.sample_rate, len(frame.layout.channels)))
    return frame


# Option 1: record audio in the browser via WebRTC
if audio_option == "Record Audio":
    st.write("Click the button to start/stop recording.")

    # Public Google STUN servers for NAT traversal
    rtc_configuration = RTCConfiguration(
        {
            "iceServers": [
                {"urls": ["stun:stun1.l.google.com:19302"]},
                {"urls": ["stun:stun2.l.google.com:19302"]},
            ]
        }
    )

    # Start WebRTC recording
    webrtc_ctx = webrtc_streamer(
        key="audio-stream",
        mode=WebRtcMode.SENDONLY,
        rtc_configuration=rtc_configuration,
        media_stream_constraints={"audio": True, "video": False},
        audio_frame_callback=audio_frame_callback,
    )

    if webrtc_ctx.state.playing:
        st.write("Recording...")

    # Drain the queue and convert the recorded frames into a mono float array
    recorded_audio = []
    sr = None
    while not audio_queue.empty():
        frame_array, sr, n_channels = audio_queue.get()
        # Packed s16 frames arrive interleaved with shape (1, samples * channels);
        # deinterleave, then average the channels down to mono
        mono = frame_array.reshape(-1, n_channels).mean(axis=1)
        recorded_audio.append(mono)

    if recorded_audio:
        # Normalize 16-bit PCM to float32 in [-1, 1]
        audio_data = np.concatenate(recorded_audio).astype(np.float32) / 32768.0

        # Display audio properties
        audio_size = audio_data.size * 2  # bytes, as 16-bit PCM
        duration = audio_data.size / sr
        st.write(f"Audio size: {audio_size} bytes")
        st.write(f"Sample rate: {sr} Hz")
        st.write(f"Duration: {duration:.2f} seconds")

        # WebRTC typically delivers 48 kHz audio; resample to the model's rate
        if sr != TARGET_SR:
            audio_data = librosa.resample(audio_data, orig_sr=sr, target_sr=TARGET_SR)
            sr = TARGET_SR

        # Perform conversion using the selected model
        st.subheader("Converting audio to text...")
        start_time = time.time()

        model = load_model(MODELS[model_choice])
        # The ASR pipeline accepts raw samples together with their sample rate;
        # this avoids the broken BytesIO(sf.write(...)) call (sf.write returns None)
        result = model({"raw": audio_data, "sampling_rate": sr})

        end_time = time.time()

        # Display results
        st.write("Transcription:", result["text"])
        st.write(f"Conversion took {end_time - start_time:.2f} seconds")

# Option 2: upload an audio file
elif audio_option == "Upload Audio":
    audio_file = st.file_uploader("Upload an audio file", type=["wav", "mp3", "ogg"])

    if audio_file:
        # sr=None preserves the file's original sample rate
        audio_data, sr = librosa.load(audio_file, sr=None)

        # Display information about the audio file
        st.write(f"Audio file size: {audio_file.size} bytes")
        st.write(f"Sample rate: {sr} Hz")
        st.write(f"Duration: {len(audio_data) / sr:.2f} seconds")

        # Re-encode to WAV in a temporary file so the pipeline can read it by path
        # (decoding by path relies on ffmpeg being available on the system PATH)
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
            sf.write(tmp_file.name, audio_data, sr, format="WAV")
            tmp_file_path = tmp_file.name

        st.write("Converting audio to text...")
        start_time = time.time()

        model = load_model(MODELS[model_choice])
        result = model(tmp_file_path)

        end_time = time.time()

        # Display results
        st.write("Transcription:", result["text"])
        st.write(f"Conversion took {end_time - start_time:.2f} seconds")
else:
    st.write("Please select an audio input option.")
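
# --- Usage note (a sketch under assumptions: the script is saved as app.py,
# and torch is installed as the backend for the transformers pipeline) ---
#   pip install streamlit streamlit-webrtc transformers torch soundfile librosa av numpy
#   streamlit run app.py
# Transcribing an uploaded file by path additionally requires ffmpeg on the
# system PATH, since the transformers ASR pipeline uses ffmpeg to decode
# audio files given as paths; the recording branch passes raw arrays instead
# and does not need it.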