# serJD's picture
# Create app.py
# 0ca4d23 verified
# raw
# history blame
# 1.89 kB
import gradio as gr
import speech_recognition as sr
from PIL import Image
import io
import base64
import json
def _encode_image_base64(image, max_width=1024):
    """Return *image* as a base64-encoded JPEG string at most *max_width* wide.

    Args:
        image: Anything ``PIL.Image.open`` accepts (a filesystem path or a
            binary file object).
        max_width: Upper bound for the output width in pixels. Wider images
            are scaled down with their aspect ratio preserved; images that
            already fit are kept at their original size.

    Returns:
        The JPEG bytes, base64-encoded and decoded to ``str``.
    """
    # The context manager releases the underlying file once pixels are loaded.
    with Image.open(image) as opened:
        width, height = opened.size
        if width > max_width:
            scale = max_width / float(width)
            # Guard against a zero height for extremely wide images.
            new_height = max(1, int(float(height) * scale))
            # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the same
            # filter under its current name.
            picture = opened.resize((max_width, new_height), Image.LANCZOS)
        else:
            # copy() forces the pixel data to load before the file is closed.
            picture = opened.copy()

    # JPEG cannot store alpha or palette modes (e.g. RGBA/P from PNG uploads).
    if picture.mode not in ("RGB", "L", "CMYK"):
        picture = picture.convert("RGB")

    buffered = io.BytesIO()
    picture.save(buffered, format="JPEG")
    return base64.b64encode(buffered.getvalue()).decode('utf-8')


def _transcribe_audio(audio):
    """Return the text recognized in *audio* using ``recognize_google``.

    Args:
        audio: A path or file object that ``sr.AudioFile`` can read.

    Returns:
        The transcript, or a human-readable message when the speech could not
        be understood or the recognition request failed.
    """
    recognizer = sr.Recognizer()
    with sr.AudioFile(audio) as source:
        audio_data = recognizer.record(source)
    # The recording is fully in memory now, so recognition runs after the
    # audio file has been closed.
    try:
        return recognizer.recognize_google(audio_data)
    except sr.UnknownValueError:
        return "Could not understand audio"
    except sr.RequestError as e:
        return f"Could not request results; {e}"


def process_data(image, audio):
    """Bundle an uploaded image and a voice recording into one JSON string.

    Args:
        image: Path or file object of the uploaded image, or ``None`` when no
            image was provided.
        audio: Path or file object of the recorded audio, or ``None`` when
            nothing was recorded.

    Returns:
        A JSON document with two keys: ``"image"`` (base64 JPEG, or ``""``
        when no image was given) and ``"text"`` (the transcript or an error
        message, or ``""`` when no audio was given).
    """
    img_str = _encode_image_base64(image) if image is not None else ""
    text = _transcribe_audio(audio) if audio is not None else ""

    # Sending the payload to the Speckle stream is not implemented yet; the
    # JSON is returned so the UI can display it.
    return json.dumps({"image": img_str, "text": text})
# Gradio UI: an image upload and a microphone recorder feed process_data,
# whose JSON result is shown in a text box.
# NOTE(review): the original indentation was lost; the nesting below (inputs
# inside the Row, button and output beneath it) is reconstructed -- confirm.
with gr.Blocks() as demo:
    gr.Markdown("### Upload Image and Record Voice Message")
    with gr.Row():
        # "filepath" replaces the deprecated "file" type; process_data's
        # Image.open / sr.AudioFile calls accept plain paths.
        image = gr.Image(type="filepath", label="Upload Image")
        # NOTE(review): Gradio 4 renamed `source=` to `sources=[...]`;
        # adjust to match the Gradio version this app is pinned to.
        audio = gr.Audio(
            source="microphone",
            type="filepath",
            label="Record Voice",
        )
    submit_btn = gr.Button("Submit")
    output = gr.Textbox(label="JSON Output")
    submit_btn.click(fn=process_data, inputs=[image, audio], outputs=output)

# Only start the server when run as a script, so importing this module (for
# tests or reuse of process_data) does not launch the app.
if __name__ == "__main__":
    demo.launch()