import streamlit as st
import soundfile as sf
import librosa
import numpy as np
import time
import tempfile
import queue
from transformers import pipeline
from streamlit_webrtc import webrtc_streamer, WebRtcMode, RTCConfiguration
import av

# Available speech-recognition models
MODELS = {
    "Whisper (English)": "openai/whisper-small.en",
    "Whisper (Multilingual)": "openai/whisper-small",
    "Facebook Wav2Vec2": "facebook/wav2vec2-large-960h",
    # XLSR-53 is a Facebook (Meta AI) checkpoint, not Google; note it is pretrained only,
    # so a CTC fine-tuned variant is needed for usable transcriptions.
    "Wav2Vec2 XLSR-53 (Multilingual)": "facebook/wav2vec2-large-xlsr-53",
    "Whisper (Thai)": "openai/whisper-large",
}


@st.cache_resource
def load_asr_model(model_name: str):
    # Cache the pipeline so the model is not reloaded on every Streamlit rerun
    return pipeline("automatic-speech-recognition", model=model_name)


# App UI
st.title("Audio to Text Conversion")
st.subheader("Select language and model")

# Language selection (informational; the chosen model determines the language)
language = st.selectbox("Choose Language", options=["English", "Thai"])

# Model selection
model_choice = st.selectbox("Choose a Model", options=list(MODELS.keys()))

# Audio input options
st.subheader("Record or Upload your audio")
audio_option = st.radio("Choose an option:", ('Record Audio', 'Upload Audio'))

audio_data = None

# Queue to store recorded audio frames
audio_queue = queue.Queue()


# WebRTC audio callback: runs in a separate thread for every incoming frame
def audio_frame_callback(frame: av.AudioFrame):
    audio = frame.to_ndarray()  # typically shape (channels, samples)
    audio_queue.put(audio)
    return frame


# Option 1: Record audio via browser using WebRTC
if audio_option == 'Record Audio':
    st.write("Click the button to start/stop recording.")

    # Use Google's public STUN servers for NAT traversal
    rtc_configuration = RTCConfiguration(
        {
            "iceServers": [
                {"urls": ["stun:stun1.l.google.com:19302"]},
                {"urls": ["stun:stun2.l.google.com:19302"]},
            ]
        }
    )

    # Start WebRTC recording (audio only, sent from the browser to the server)
    webrtc_ctx = webrtc_streamer(
        key="audio-stream",
        mode=WebRtcMode.SENDONLY,
        rtc_configuration=rtc_configuration,
        media_stream_constraints={"audio": True, "video": False},
        audio_frame_callback=audio_frame_callback,
    )

    if webrtc_ctx.state.playing:
        st.write("Recording...")

    # Drain the queue of recorded frames for processing
    recorded_audio = []
    while not audio_queue.empty():
        recorded_audio.append(audio_queue.get())

    if recorded_audio:
        # Join frames along the time axis and downmix to mono
        audio_data = np.concatenate(recorded_audio, axis=1).mean(axis=0).astype(np.int16)
        sr = 48000  # WebRTC audio is typically delivered at 48 kHz

        # Compute audio properties
        audio_size = audio_data.size * 2  # approximate size in bytes (16-bit PCM)
        duration = len(audio_data) / sr

        # Display audio properties
        st.write(f"Audio Size: {audio_size} bytes")
        st.write(f"Frame Rate: {sr} Hz")
        st.write(f"Duration: {duration:.2f} seconds")

        # Perform conversion using the selected model
        st.subheader("Converting audio to text...")
        start_time = time.time()

        # Load the model from Hugging Face
        model = load_asr_model(MODELS[model_choice])

        # Write the recording to a WAV file and pass its path to the pipeline
        # (sf.write returns None, so its result cannot be wrapped in BytesIO)
        sf.write("temp.wav", audio_data, sr)
        result = model("temp.wav")

        end_time = time.time()

        # Display results
        st.write("Transcription:", result['text'])
        st.write(f"Conversion took {end_time - start_time:.2f} seconds")

# Option 2: Upload audio
elif audio_option == 'Upload Audio':
    audio_file = st.file_uploader("Upload audio file (WAV format)", type=['wav'])

    if audio_file:
        # Save the upload to a temporary file so librosa and the pipeline can read it
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
            tmp_file.write(audio_file.read())
            tmp_file_path = tmp_file.name

        audio_data, sr = librosa.load(tmp_file_path, sr=None)

        # Compute audio properties
        audio_size = len(audio_data) * 2  # approximate size in bytes (16-bit PCM)
        frame_rate = sr
        duration = librosa.get_duration(y=audio_data, sr=sr)

        # Display audio properties
        st.write(f"Audio Size: {audio_size} bytes")
        st.write(f"Frame Rate: {frame_rate} Hz")
        st.write(f"Duration: {duration:.2f} seconds")

        # Perform conversion using the selected model
        st.subheader("Converting audio to text...")
        start_time = time.time()

        # Load the model from Hugging Face
        model = load_asr_model(MODELS[model_choice])

        # The uploaded audio is already on disk; pass the path straight to the pipeline
        result = model(tmp_file_path)

        end_time = time.time()

        # Display results
        st.write("Transcription:", result['text'])
        st.write(f"Conversion took {end_time - start_time:.2f} seconds")

else:
    st.write("Please select an audio input option.")