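"""Streamlit app that converts recorded or uploaded audio to text using
Hugging Face automatic-speech-recognition pipelines.

Usage note (assuming this file is saved as app.py):
    streamlit run app.py

Requires streamlit, streamlit-webrtc, transformers, librosa, soundfile and av;
ffmpeg should also be available so the pipeline can decode audio file paths.
"""
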
import streamlit as st
import soundfile as sf
import librosa
import numpy as np
import time
from transformers import pipeline
from io import BytesIO
import tempfile
from streamlit_webrtc import webrtc_streamer, WebRtcMode, RTCConfiguration
import av
import queue

# Define the models
MODELS = {
    "Whisper (English)": "openai/whisper-small.en",
    "Whisper (Multilingual)": "openai/whisper-small",
    "Facebook Wav2Vec2": "facebook/wav2vec2-large-960h",
    "Google Wav2Vec2": "google/wav2vec2-large-xlsr-53",
    "Whisper (Thai)": "openai/whisper-large"
}
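
# Optional helper (an addition, not part of the original flow): caching the ASR
# pipeline with st.cache_resource avoids re-loading the model on every
# Streamlit rerun; the inline pipeline(...) calls further down could be swapped
# for load_asr_model(MODELS[model_choice]).
@st.cache_resource
def load_asr_model(model_name: str):
    return pipeline("automatic-speech-recognition", model=model_name)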

# App UI
st.title("Audio to Text Conversion")
st.subheader("Select language and model")

# Language selection
language = st.selectbox("Choose Language", options=["English", "Thai"])
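# (The selected language is currently informational only; the transcription
# language is determined by the model chosen below.)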

# Model selection
model_choice = st.selectbox("Choose a Model", options=list(MODELS.keys()))

# Audio input options
st.subheader("Record or Upload your audio")
audio_option = st.radio("Choose an option:", ('Record Audio', 'Upload Audio'))

audio_data = None

# Queue to store recorded audio frames
audio_queue = queue.Queue()

# WebRTC audio frame callback.
# streamlit_webrtc invokes this on a background thread, so the thread-safe
# queue.Queue above is used to hand the PCM frames back to the main script.
def audio_frame_callback(frame: av.AudioFrame):
    # Keep each frame's sample rate alongside its samples instead of assuming
    # a fixed rate downstream; multi-channel frames are simply flattened.
    audio_queue.put((frame.to_ndarray().flatten(), frame.sample_rate))
    return frame

# Option 1: Record audio via browser using WebRTC
if audio_option == 'Record Audio':
    st.write("Click the button to start/stop recording.")

    # Change STUN server to a different one to avoid potential issues
    rtc_configuration = RTCConfiguration(
        {
            "iceServers": [
                {"urls": ["stun:stun1.l.google.com:19302"]}, 
                {"urls": ["stun:stun2.l.google.com:19302"]}
            ]
        }
    )

    # Start WebRTC recording
    webrtc_ctx = webrtc_streamer(
        key="audio-stream",
        mode=WebRtcMode.SENDONLY,
        rtc_configuration=rtc_configuration,
        media_stream_constraints={"audio": True, "video": False},
        audio_frame_callback=audio_frame_callback,
    )
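    # Note: in SENDONLY mode the browser only sends audio to the server.
    # webrtc_ctx.state.playing is True while that stream is active, and the
    # frames captured by audio_frame_callback are drained from the queue below.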

    # Ensure we are recording
    if webrtc_ctx.state.playing:
        st.write("Recording...")

        # Drain the recorded audio frames into a single numpy array
        recorded_audio = []
        sr = 16000  # fallback sample rate, used only if no frames were captured
        while not audio_queue.empty():
            samples, sr = audio_queue.get()
            recorded_audio.append(samples)

        if recorded_audio:
            audio_data = np.concatenate(recorded_audio)

            # Compute audio properties
            audio_size = audio_data.nbytes  # size of the raw sample buffer in bytes
            duration = len(audio_data) / sr

            # Display audio properties
            st.write(f"Audio Size: {audio_size} bytes")
            st.write(f"Frame Rate: {sr} Hz")
            st.write(f"Duration: {duration:.2f} seconds")

            # Perform conversion using the selected model
            st.subheader("Converting audio to text...")

            start_time = time.time()

            # Load the ASR pipeline from the Hugging Face Hub
            model = pipeline("automatic-speech-recognition", model=MODELS[model_choice])

            # Write the samples to a temporary WAV file and transcribe it.
            # (sf.write returns None, so its result cannot be wrapped in BytesIO.)
            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_wav:
                tmp_wav_path = tmp_wav.name
            sf.write(tmp_wav_path, audio_data, sr)
            result = model(tmp_wav_path)

            end_time = time.time()

            # Display results
            st.write("Transcription:", result['text'])
            st.write(f"Conversion took {end_time - start_time:.2f} seconds")

# Option 2: Upload audio
elif audio_option == 'Upload Audio':
    audio_file = st.file_uploader("Upload audio file (WAV format)", type=['wav'])
    
    if audio_file:
        # Load the audio file
        with tempfile.NamedTemporaryFile(delete=False) as tmp_file:
            tmp_file.write(audio_file.read())
            tmp_file_path = tmp_file.name
        
        audio_data, sr = librosa.load(tmp_file_path, sr=None)
        
        # Compute audio properties
        audio_size = audio_file.size  # size of the uploaded file in bytes
        frame_rate = sr
        duration = librosa.get_duration(y=audio_data, sr=sr)

        # Display audio properties
        st.write(f"Audio Size: {audio_size} bytes")
        st.write(f"Frame Rate: {frame_rate} Hz")
        st.write(f"Duration: {duration:.2f} seconds")

        # Perform conversion using the selected model
        st.subheader("Converting audio to text...")

        start_time = time.time()

        # Load the model from HuggingFace
        model = pipeline("automatic-speech-recognition", model=MODELS[model_choice])

        # Perform the conversion on the uploaded file (already saved to disk above)
        result = model(tmp_file_path)

        end_time = time.time()

        # Display results
        st.write("Transcription:", result['text'])
        st.write(f"Conversion took {end_time - start_time:.2f} seconds")

else:
    st.write("Please select an audio input option.")