# Spaces: Sleeping  (hosting-page status text captured with this file; not part of the program)
import queue
import tempfile
import time
from io import BytesIO

import av
import librosa
import numpy as np
import soundfile as sf
import streamlit as st
from streamlit_webrtc import webrtc_streamer, WebRtcMode, RTCConfiguration
from transformers import pipeline
# Selectable speech-to-text models: label shown in the UI -> model id that is
# handed to transformers.pipeline().
# NOTE(review): "google/wav2vec2-large-xlsr-53" looks wrong -- the XLSR-53
# checkpoint appears to be published under the "facebook/" namespace and to be
# a pretrained (not ASR-fine-tuned) model; confirm this entry can be loaded.
# NOTE(review): the "Whisper (Thai)" label points at the generic
# "openai/whisper-large" id; nothing here forces Thai decoding -- confirm intent.
MODELS = {
    "Whisper (English)": "openai/whisper-small.en",
    "Whisper (Multilingual)": "openai/whisper-small",
    "Facebook Wav2Vec2": "facebook/wav2vec2-large-960h",
    "Google Wav2Vec2": "google/wav2vec2-large-xlsr-53",
    "Whisper (Thai)": "openai/whisper-large",
}
# --- Page header -----------------------------------------------------------
st.title("Audio to Text Conversion")
st.subheader("Select language and model")

# Language selection.
# NOTE(review): `language` is not read anywhere else in the visible script, so
# the choice currently has no effect on transcription -- confirm intent.
language = st.selectbox("Choose Language", options=["English", "Thai"])

# Model selection: the chosen label is later used as a key into MODELS.
model_choice = st.selectbox("Choose a Model", options=list(MODELS.keys()))

# --- Audio input -----------------------------------------------------------
st.subheader("Record or Upload your audio")
audio_option = st.radio("Choose an option:", ('Record Audio', 'Upload Audio'))

# Filled in by whichever input branch runs below.
audio_data = None
# Queue of captured audio chunks.
# Streamlit re-executes this script on every interaction, so a queue created
# afresh on each run would never contain frames captured before that run.
# Keeping it in st.session_state lets the recording survive reruns.
if "audio_queue" not in st.session_state:
    st.session_state["audio_queue"] = queue.Queue()
audio_queue = st.session_state["audio_queue"]


def audio_frame_callback(frame: av.AudioFrame):
    """Enqueue one captured frame plus the metadata needed to decode it.

    Each queue item is a tuple ``(samples, sample_rate, channels, planar)``:
    the raw ``frame.to_ndarray()`` array, the frame's own sample rate, its
    channel count and whether its sample format is planar.  The frame is
    returned unchanged.

    Only the module-level ``audio_queue`` reference is touched here (not
    ``st.session_state``), because this callback is handed to
    ``webrtc_streamer`` rather than being run as part of the script body.
    """
    audio_queue.put(
        (
            frame.to_ndarray(),
            frame.sample_rate,
            len(frame.layout.channels),
            frame.format.is_planar,
        )
    )
    return frame


def _chunks_to_mono(chunks):
    """Merge queued chunks into one mono float32 signal.

    ``chunks`` is a non-empty list of the tuples produced by
    ``audio_frame_callback``.  Returns ``(signal, sample_rate)`` where
    ``signal`` is 1-D and ``sample_rate`` is taken from the first chunk.
    """
    sample_rate = chunks[0][1]
    parts = []
    for samples, _rate, channels, planar in chunks:
        data = np.asarray(samples)
        # assumes PyAV's layout: planar -> (channels, n), packed ->
        # (1, n * channels) interleaved -- TODO confirm for the installed version
        per_channel = data.T if planar else data.reshape(-1, channels)
        mono = per_channel.astype(np.float32).mean(axis=1)
        if np.issubdtype(data.dtype, np.integer):
            # Scale integer PCM to the [-1, 1) float range.
            mono /= np.iinfo(data.dtype).max + 1.0
        parts.append(mono)
    return np.concatenate(parts), sample_rate


def _transcribe_and_report(samples, sample_rate):
    """Display the clip's properties, transcribe it and display the result.

    ``samples`` is a 1-D mono float signal and ``sample_rate`` its rate in Hz.
    The model is looked up from ``MODELS[model_choice]``.
    """
    # Compute and display audio properties
    audio_size = len(samples) * 2  # in bytes (16-bit PCM)
    duration = len(samples) / sample_rate
    st.write(f"Audio Size: {audio_size} bytes")
    st.write(f"Frame Rate: {sample_rate} Hz")
    st.write(f"Duration: {duration:.2f} seconds")

    # Perform conversion using the selected model
    st.subheader("Converting audio to text...")
    start_time = time.time()
    # NOTE(review): the pipeline is rebuilt on every call; consider caching it
    # if model load time matters.
    asr = pipeline("automatic-speech-recognition", model=MODELS[model_choice])
    # Feed the model audio at the rate its feature extractor expects, and pass
    # the samples directly instead of round-tripping through a file.
    target_rate = asr.feature_extractor.sampling_rate
    if sample_rate != target_rate:
        samples = librosa.resample(
            samples, orig_sr=sample_rate, target_sr=target_rate
        )
    result = asr(
        {
            "raw": np.asarray(samples, dtype=np.float32),
            "sampling_rate": target_rate,
        }
    )
    end_time = time.time()

    # Display results
    st.write("Transcription:", result['text'])
    st.write(f"Conversion took {end_time - start_time:.2f} seconds")


# Option 1: Record audio via browser using WebRTC
if audio_option == 'Record Audio':
    st.write("Click the button to start/stop recording.")
    rtc_configuration = RTCConfiguration(
        {
            "iceServers": [
                {"urls": ["stun:stun1.l.google.com:19302"]},
                {"urls": ["stun:stun2.l.google.com:19302"]},
            ]
        }
    )
    # NOTE(review): confirm that streamlit-webrtc invokes audio_frame_callback
    # in SENDONLY mode; if it does not, frames must be read from
    # webrtc_ctx.audio_receiver instead.
    webrtc_ctx = webrtc_streamer(
        key="audio-stream",
        mode=WebRtcMode.SENDONLY,
        rtc_configuration=rtc_configuration,
        media_stream_constraints={"audio": True, "video": False},
        audio_frame_callback=audio_frame_callback,
    )
    if webrtc_ctx.state.playing:
        st.write("Recording...")
    else:
        # Recording has stopped (or not started): transcribe whatever was
        # captured.  Draining while still recording would split the clip.
        recorded_chunks = []
        while True:
            try:
                recorded_chunks.append(audio_queue.get_nowait())
            except queue.Empty:
                break
        if recorded_chunks:
            audio_data, sr = _chunks_to_mono(recorded_chunks)
            _transcribe_and_report(audio_data, sr)
# Option 2: Upload audio
elif audio_option == 'Upload Audio':
    audio_file = st.file_uploader("Upload audio file (WAV format)", type=['wav'])
    if audio_file is not None:
        # Decode straight from memory: no temporary file is left behind and
        # the upload is never rewritten on disk.
        try:
            audio_data, sr = librosa.load(BytesIO(audio_file.read()), sr=None)
        except Exception as exc:  # UI boundary: report instead of crashing
            st.error(f"Could not read the uploaded audio file: {exc}")
        else:
            _transcribe_and_report(audio_data, sr)
else:
    st.write("Please select an audio input option.")