import spaces
import gradio as gr

from agents import build_graph

# Initialize the graph
graph = build_graph()


@spaces.GPU(duration=60)
def process_and_display(image, voices):
    if not voices:  # No voice personas selected
        return "Please select at least one voice persona.", "No voice personas selected."

    # Initialize state
    state = {"image": image, "voices": voices, "caption": "", "descriptions": []}

    # Run the graph
    result = graph.invoke(state, {"max_concurrency": 1})

    descriptions: list[str] = result["descriptions"]
    description = "\n\n---\n\n".join(descriptions)

    # Return the caption and description
    return result["caption"], description


def create_interface():
    with gr.Blocks() as demo:
        gr.Markdown("# Image Description with Voice Personas")
        gr.Markdown("""
        This app takes an image and generates descriptions using selected voice personas.

        1. Upload an image
        2. Select voice personas from the multi-select dropdown
        3. Click "Generate Description" to see the results

        The descriptions will be generated in parallel for all selected voices.
        """)

        with gr.Row():
            with gr.Column():
                image_input = gr.Image(type="pil", label="Upload an Image")
                voice_dropdown = gr.Dropdown(
                    choices=[
                        "scurvy-ridden pirate",
                        "private investigator",
                        "sarcastic teenager",
                        "forgetful wizard",
                        "shakespearian",
                    ],
                    label="Select Voice Personas (max 2 recommended)",
                    multiselect=True,
                    value=["scurvy-ridden pirate", "private investigator"],
                )
                submit_button = gr.Button("Generate Description")

            with gr.Column():
                caption_output = gr.Textbox(label="Image Caption", lines=4)
                description_output = gr.Markdown(label="Voice Descriptions")

        submit_button.click(
            fn=process_and_display,
            inputs=[image_input, voice_dropdown],
            outputs=[caption_output, description_output],
        )

    return demo


# Launch the app
demo = create_interface()

if __name__ == "__main__":
    demo.launch()