AAhad's picture
audio file format
8742ee2
import streamlit as st
import soundfile as sf
import librosa
import numpy as np
import time
from transformers import pipeline
from io import BytesIO
import tempfile
from streamlit_webrtc import webrtc_streamer, WebRtcMode, RTCConfiguration
import av
import queue
# Selectable speech-recognition models: dropdown label -> Hugging Face model id.
# The insertion order below is the order in which the labels are listed.
# NOTE(review): confirm "google/wav2vec2-large-xlsr-53" resolves on the Hub and
# is usable for speech recognition; also note that the "Whisper (Thai)" label
# points at the general "openai/whisper-large" checkpoint.
_MODEL_ENTRIES = (
    ("Whisper (English)", "openai/whisper-small.en"),
    ("Whisper (Multilingual)", "openai/whisper-small"),
    ("Facebook Wav2Vec2", "facebook/wav2vec2-large-960h"),
    ("Google Wav2Vec2", "google/wav2vec2-large-xlsr-53"),
    ("Whisper (Thai)", "openai/whisper-large"),
)
MODELS = dict(_MODEL_ENTRIES)
# App UI
# Page title followed by the header of the language/model section.
st.title("Audio to Text Conversion")
st.subheader("Select language and model")
# Language selection
# NOTE(review): `language` is not read anywhere else in the visible script --
# confirm whether it is meant to influence model choice or decoding.
language = st.selectbox("Choose Language", options=["English", "Thai"])
# Model selection
# The chosen label is later used as a key into MODELS to get the model id.
model_choice = st.selectbox("Choose a Model", options=list(MODELS.keys()))
# Audio input options
st.subheader("Record or Upload your audio")
# The selected value decides which input branch below is executed.
audio_option = st.radio("Choose an option:", ('Record Audio', 'Upload Audio'))
# Placeholder for the audio samples; assigned by whichever input branch runs.
audio_data = None
# Queue to store recorded audio frames
# It is filled by audio_frame_callback and drained by the recording branch.
# NOTE(review): this queue is created at module level, i.e. anew on every
# execution of the script -- confirm that frames queued during an earlier
# run are not expected to survive a Streamlit rerun.
audio_queue = queue.Queue()
# WebRTC audio recorder callback
def audio_frame_callback(frame: av.AudioFrame):
    """Queue the samples of an incoming audio frame and pass the frame on.

    The frame is converted with ``to_ndarray()`` and the resulting array is
    put on the module-level ``audio_queue``. Only the array is queued, so
    frame metadata is not available to whoever drains the queue.

    Args:
        frame: The audio frame handed to the callback.

    Returns:
        The same ``frame`` object, unmodified.
    """
    audio_queue.put(frame.to_ndarray())
    return frame
# Sample rate assumed for audio captured through WebRTC.
# NOTE(review): the frame callback queues bare arrays, so the real rate of the
# incoming frames is not available here. Browsers commonly deliver 48 kHz
# audio -- confirm, and if so carry the frame's sample rate through the queue.
RECORDING_SAMPLE_RATE = 16000


def _drain_queue(pending):
    """Return every item currently waiting in *pending* without blocking."""
    items = []
    while True:
        try:
            items.append(pending.get_nowait())
        except queue.Empty:
            return items


def _chunks_to_mono(chunks):
    """Join queued frame arrays into one 1-D float32 signal.

    Each chunk is treated as ``(rows, samples)``. Chunks are joined along the
    sample axis (not stacked as extra rows), so the length of the result is a
    number of samples rather than a number of frames. Integer PCM is scaled
    to roughly [-1, 1] and the rows are averaged into a single channel.

    NOTE(review): a packed multi-channel frame would arrive as one interleaved
    row and cannot be de-interleaved here without the channel count --
    confirm the layout of the frames delivered to the callback.
    """
    stacked = np.concatenate([np.atleast_2d(chunk) for chunk in chunks], axis=-1)
    if np.issubdtype(stacked.dtype, np.integer):
        full_scale = float(np.iinfo(stacked.dtype).max)
        stacked = stacked.astype(np.float32) / full_scale
    return stacked.mean(axis=0).astype(np.float32)


def _transcribe_and_report(samples, sample_rate):
    """Run the selected model on *samples* and display the text and timing.

    The samples are written to a WAV file inside a temporary directory and
    the pipeline is given that file's path. ``sf.write`` returns ``None``, so
    its result must not be wrapped in ``BytesIO`` and handed to the model.
    The temporary directory, and the WAV file in it, is removed on exit.
    """
    start_time = time.time()
    # Load the model from HuggingFace
    model = pipeline("automatic-speech-recognition", model=MODELS[model_choice])
    with tempfile.TemporaryDirectory() as tmp_dir:
        wav_path = f"{tmp_dir}/audio.wav"
        sf.write(wav_path, samples, sample_rate, format='WAV')
        result = model(wav_path)
    end_time = time.time()
    # Display results
    st.write("Transcription:", result['text'])
    st.write(f"Conversion took {end_time - start_time:.2f} seconds")


# Option 1: Record audio via browser using WebRTC
if audio_option == 'Record Audio':
    st.write("Click the button to start/stop recording.")
    # Change STUN server to a different one to avoid potential issues
    rtc_configuration = RTCConfiguration(
        {
            "iceServers": [
                {"urls": ["stun:stun1.l.google.com:19302"]},
                {"urls": ["stun:stun2.l.google.com:19302"]},
            ]
        }
    )
    # Start WebRTC recording
    webrtc_ctx = webrtc_streamer(
        key="audio-stream",
        mode=WebRtcMode.SENDONLY,
        rtc_configuration=rtc_configuration,
        media_stream_constraints={"audio": True, "video": False},
        audio_frame_callback=audio_frame_callback,
    )
    # Ensure we are recording
    if webrtc_ctx.state.playing:
        st.write("Recording...")
        # Collect whatever the callback has queued so far
        recorded_audio = _drain_queue(audio_queue)
        if recorded_audio:
            audio_data = _chunks_to_mono(recorded_audio)
            sr = RECORDING_SAMPLE_RATE
            # Compute audio properties
            audio_size = len(audio_data) * 2  # in bytes (16-bit PCM)
            duration = len(audio_data) / sr
            # Display audio properties
            st.write(f"Audio Size: {audio_size} bytes")
            st.write(f"Frame Rate: {sr} Hz")
            st.write(f"Duration: {duration:.2f} seconds")
            # Perform conversion using the selected model
            st.subheader("Converting audio to text...")
            _transcribe_and_report(audio_data, sr)
# Option 2: Upload audio
elif audio_option == 'Upload Audio':
    audio_file = st.file_uploader("Upload an audio file", type=["wav", "mp3", "ogg"])
    if audio_file:
        # Read the uploaded audio file
        audio_data, sr = librosa.load(audio_file, sr=None)  # sr=None preserves original sample rate
        # Display information about the audio file
        st.write(f"Audio file size: {audio_file.size} bytes")
        st.write(f"Sample rate: {sr}")
        st.write(f"Duration: {len(audio_data) / sr:.2f} seconds")
        # The decoded samples are re-encoded as WAV before being transcribed
        st.write("Converting audio to text...")
        _transcribe_and_report(audio_data, sr)
else:
    st.write("Please select an audio input option.")