import gradio as gr
import speech_recognition as sr
from PIL import Image
import io
import base64
import json

def _encode_image(image, max_width=1024):
    """Return *image* as a base64-encoded JPEG string, at most *max_width* wide.

    Args:
        image: Anything ``PIL.Image.open`` accepts (the value is passed to it
            unchanged).
        max_width: Upper bound on the output width in pixels. Wider images are
            scaled down with their aspect ratio preserved; narrower images are
            left at their original size rather than being upscaled.

    Returns:
        The JPEG bytes, base64-encoded and decoded to a UTF-8 ``str``.
    """
    with Image.open(image) as opened:
        # JPEG cannot store alpha or palette data, so saving e.g. an RGBA PNG
        # would raise. Modes JPEG can write natively are left untouched.
        if opened.mode in ("RGB", "L", "CMYK"):
            picture = opened
        else:
            picture = opened.convert("RGB")

        width, height = picture.size
        if width > max_width:
            scale = max_width / float(width)
            # Clamp to 1 so an extremely wide, short image never truncates to
            # a zero height, which resize() rejects.
            new_height = max(1, int(float(height) * scale))
            # LANCZOS is the filter formerly exposed as Image.ANTIALIAS, an
            # alias that was removed in Pillow 10.
            picture = picture.resize((max_width, new_height), Image.LANCZOS)

        buffered = io.BytesIO()
        picture.save(buffered, format="JPEG")

    return base64.b64encode(buffered.getvalue()).decode("utf-8")


def _transcribe_audio(audio):
    """Return the speech in *audio* as text, or a human-readable error message.

    Args:
        audio: Anything ``sr.AudioFile`` accepts (the value is passed to it
            unchanged).

    Returns:
        The transcript from ``recognize_google``; a fixed message when the
        speech could not be understood; or a message including the error when
        the recognition request itself failed.
    """
    recognizer = sr.Recognizer()
    with sr.AudioFile(audio) as source:
        audio_data = recognizer.record(source)

    try:
        return recognizer.recognize_google(audio_data)
    except sr.UnknownValueError:
        return "Could not understand audio"
    except sr.RequestError as e:
        return f"Could not request results; {e}"


def process_data(image, audio) -> str:
    """Bundle an uploaded image and a voice recording into a JSON string.

    Args:
        image: The uploaded image, or ``None`` when no image was provided.
        audio: The recorded audio, or ``None`` when nothing was recorded.

    Returns:
        A JSON object serialized to ``str`` with two keys: ``"image"`` (the
        base64-encoded JPEG, or ``""`` when *image* is ``None``) and ``"text"``
        (the transcript or an error message, or ``""`` when *audio* is
        ``None``).
    """
    img_str = _encode_image(image) if image is not None else ""
    text = _transcribe_audio(audio) if audio is not None else ""

    # Here you would add your code to send the JSON to the Speckle stream.
    # For now, it is just returned so the UI can display it.
    return json.dumps({"image": img_str, "text": text})

# UI: an image upload and a microphone recorder side by side, a submit button,
# and a textbox that shows the JSON string returned by process_data.
with gr.Blocks() as demo:
    gr.Markdown("### Upload Image and Record Voice Message")
    with gr.Row():
        # type="filepath" replaces the deprecated "file" type; process_data
        # passes the value straight to Image.open / sr.AudioFile, and both
        # accept a path string.
        image = gr.Image(type="filepath", label="Upload Image")
        # NOTE(review): `source=` is the Gradio 3.x spelling; newer releases
        # appear to expect `sources=["microphone"]` - confirm against the
        # installed version before upgrading.
        audio = gr.Audio(source="microphone", type="filepath", label="Record Voice")
    submit_btn = gr.Button("Submit")
    output = gr.Textbox(label="JSON Output")

    # Both inputs are passed positionally to process_data(image, audio).
    submit_btn.click(fn=process_data, inputs=[image, audio], outputs=output)

demo.launch()