import streamlit as st
import soundfile as sf
import librosa
import numpy as np
import time
from transformers import pipeline
from io import BytesIO
import tempfile
from streamlit_webrtc import webrtc_streamer, WebRtcMode, RTCConfiguration
import av
import queue
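# Streamlit app: record audio in the browser via WebRTC or upload a WAV file,
# then transcribe it with a user-selected Hugging Face ASR model.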
# Define the models
MODELS = {
"Whisper (English)": "openai/whisper-small.en",
"Whisper (Multilingual)": "openai/whisper-small",
"Facebook Wav2Vec2": "facebook/wav2vec2-large-960h",
"Google Wav2Vec2": "google/wav2vec2-large-xlsr-53",
"Whisper (Thai)": "openai/whisper-large"
}
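# Note: each checkpoint is downloaded from the Hugging Face Hub on first use;
# the larger Whisper checkpoints need several GB of disk space and memory.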
# App UI
st.title("Audio to Text Conversion")
st.subheader("Select language and model")
# Language selection
language = st.selectbox("Choose Language", options=["English", "Thai"])
# Model selection
model_choice = st.selectbox("Choose a Model", options=list(MODELS.keys()))
# Audio input options
st.subheader("Record or Upload your audio")
audio_option = st.radio("Choose an option:", ('Record Audio', 'Upload Audio'))
audio_data = None
# Queue to store recorded audio frames
audio_queue = queue.Queue()
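# The frame callback below runs on a background thread managed by streamlit_webrtc,
# so a thread-safe queue hands the audio frames back to the Streamlit script run.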
# WebRTC Audio Recorder
def audio_frame_callback(frame: av.AudioFrame):
    # Flatten each incoming frame to a 1-D int16 array so the chunks can be
    # concatenated into a single waveform later
    audio = frame.to_ndarray().flatten()
    audio_queue.put(audio)
    return frame
# Option 1: Record audio via browser using WebRTC
if audio_option == 'Record Audio':
    st.write("Click the button to start/stop recording.")
    # Use public Google STUN servers to avoid potential connectivity issues
    rtc_configuration = RTCConfiguration(
        {
            "iceServers": [
                {"urls": ["stun:stun1.l.google.com:19302"]},
                {"urls": ["stun:stun2.l.google.com:19302"]}
            ]
        }
    )
    # Start WebRTC recording
    webrtc_ctx = webrtc_streamer(
        key="audio-stream",
        mode=WebRtcMode.SENDONLY,
        rtc_configuration=rtc_configuration,
        media_stream_constraints={"audio": True, "video": False},
        audio_frame_callback=audio_frame_callback,
    )
    # Show a status message while the stream is live
    if webrtc_ctx.state.playing:
        st.write("Recording...")
    # Convert the buffered audio frames to a single numpy array for processing
    recorded_audio = []
    while not audio_queue.empty():
        recorded_audio.append(audio_queue.get())
    if recorded_audio:
        audio_data = np.concatenate(recorded_audio, axis=0)
        sr = 48000  # WebRTC (Opus) audio is typically delivered at 48 kHz
        # Compute audio properties (16-bit PCM, 2 bytes per sample; duration assumes mono)
        audio_size = len(audio_data) * 2
        duration = len(audio_data) / sr
        # Display audio properties
        st.write(f"Audio Size: {audio_size} bytes")
        st.write(f"Frame Rate: {sr} Hz")
        st.write(f"Duration: {duration:.2f} seconds")
        # Perform conversion using the selected model
        st.subheader("Converting audio to text...")
        start_time = time.time()
        # Load the model from HuggingFace
        model = pipeline("automatic-speech-recognition", model=MODELS[model_choice])
        # Perform the conversion: sf.write returns None, so write a temporary
        # WAV file to disk and pass its path to the pipeline
        sf.write("temp.wav", audio_data, sr)
        result = model("temp.wav")
        end_time = time.time()
        # Display results
        st.write("Transcription:", result['text'])
        st.write(f"Conversion took {end_time - start_time:.2f} seconds")
# Option 2: Upload audio
elif audio_option == 'Upload Audio':
    audio_file = st.file_uploader("Upload audio file (WAV format)", type=['wav'])
    if audio_file:
        # Save the upload to a temporary file so librosa and the pipeline can read it from disk
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
            tmp_file.write(audio_file.read())
            tmp_file_path = tmp_file.name
        audio_data, sr = librosa.load(tmp_file_path, sr=None)
        # Compute audio properties
        audio_size = len(audio_data) * 2  # in bytes, assuming 16-bit PCM
        frame_rate = sr
        duration = librosa.get_duration(y=audio_data, sr=sr)
        # Display audio properties
        st.write(f"Audio Size: {audio_size} bytes")
        st.write(f"Frame Rate: {frame_rate} Hz")
        st.write(f"Duration: {duration:.2f} seconds")
        # Perform conversion using the selected model
        st.subheader("Converting audio to text...")
        start_time = time.time()
        # Load the model from HuggingFace
        model = pipeline("automatic-speech-recognition", model=MODELS[model_choice])
        # Perform the conversion: the uploaded audio is already on disk, so pass the file path directly
        result = model(tmp_file_path)
        end_time = time.time()
        # Display results
        st.write("Transcription:", result['text'])
        st.write(f"Conversion took {end_time - start_time:.2f} seconds")
else:
st.write("Please select an audio input option.")