"""Gradio demo: upload an image and record a voice message, get JSON back.

The image is scaled down to a maximum width, JPEG-encoded and base64-encoded;
the recording is transcribed with Google's speech recognition. Both results are
returned as a single JSON string.
"""

import base64
import io
import json

import gradio as gr
import speech_recognition as sr
from PIL import Image

# Images wider than this are scaled down (aspect ratio preserved) before encoding.
MAX_IMAGE_WIDTH = 1024

# Pixel modes that are handed to the JPEG encoder unchanged; any other mode
# (e.g. RGBA or palette images from PNG uploads) is converted to RGB first.
_JPEG_PASSTHROUGH_MODES = ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr")


def _encode_image(image):
    """Return *image* as a base64-encoded JPEG string.

    Args:
        image: A filesystem path or binary file object that Pillow can open.

    Returns:
        The base64 text (UTF-8 decoded) of the JPEG-encoded image. Images wider
        than ``MAX_IMAGE_WIDTH`` are scaled down first; narrower images keep
        their size.
    """
    # The context manager closes the underlying file once encoding is done.
    with Image.open(image) as original:
        processed = original
        width, height = original.size
        if width > MAX_IMAGE_WIDTH:
            scale = MAX_IMAGE_WIDTH / float(width)
            # Clamp so extremely wide images never get a zero-pixel height.
            new_height = max(1, int(height * scale))
            # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the same filter.
            processed = original.resize(
                (MAX_IMAGE_WIDTH, new_height), Image.LANCZOS
            )
        if processed.mode not in _JPEG_PASSTHROUGH_MODES:
            processed = processed.convert("RGB")

        buffered = io.BytesIO()
        processed.save(buffered, format="JPEG")

    return base64.b64encode(buffered.getvalue()).decode("utf-8")


def _transcribe_audio(audio):
    """Return the transcription of *audio*, or a message describing the failure.

    Args:
        audio: A path or file object accepted by ``sr.AudioFile``.

    Returns:
        The recognized text, or a human-readable message when the speech could
        not be understood or the recognition request failed.
    """
    recognizer = sr.Recognizer()
    with sr.AudioFile(audio) as source:
        audio_data = recognizer.record(source)

    try:
        return recognizer.recognize_google(audio_data)
    except sr.UnknownValueError:
        return "Could not understand audio"
    except sr.RequestError as e:
        return f"Could not request results; {e}"


def process_data(image, audio) -> str:
    """Build the JSON payload for an uploaded image and a recorded message.

    Args:
        image: Path (or file object) of the uploaded image, or ``None``.
        audio: Path (or file object) of the recorded audio, or ``None``.

    Returns:
        A JSON string with two keys: ``"image"`` (base64 JPEG, or ``""`` when no
        image was given) and ``"text"`` (transcription or failure message, or
        ``""`` when no audio was given).
    """
    img_str = _encode_image(image) if image is not None else ""
    text = _transcribe_audio(audio) if audio is not None else ""

    data = json.dumps({"image": img_str, "text": text})

    # Here you would add your code to send `data` to the Speckle stream.
    # For now, just return the JSON so the UI can display it.
    return data


with gr.Blocks() as demo:
    gr.Markdown("### Upload Image and Record Voice Message")
    with gr.Row():
        # "filepath" replaces the deprecated "file" type; process_data only
        # needs something Pillow / speech_recognition can open.
        image = gr.Image(type="filepath", label="Upload Image")
        # NOTE: `source=` is the Gradio 3.x keyword (Gradio 4 uses `sources=[...]`).
        audio = gr.Audio(source="microphone", type="filepath", label="Record Voice")
    submit_btn = gr.Button("Submit")
    output = gr.Textbox(label="JSON Output")
    submit_btn.click(fn=process_data, inputs=[image, audio], outputs=output)

if __name__ == "__main__":
    demo.launch()