fun-image-caption

Running on Zero

File size: 2,382 Bytes

6f322bd
e82b768
 
a4690cb
e82b768
a4690cb
 
 
 
73d2daa
b5b9453
 
 
 
a4690cb
b5b9453
a4690cb
 
7d14b9f
a4690cb
598dcfa
c893ae0
598dcfa
a4690cb
598dcfa
a4690cb
 
 
 
 
 
b5b9453
a4690cb
 
b5b9453
a4690cb
b5b9453
 
a4690cb
 
 
 
 
 
 
 
598dcfa
b5b9453
 
598dcfa
a4690cb
b5b9453
 
 
a4690cb
 
 
 
b5b9453
68fe4b2
a4690cb
 
 
 
 
 
 
 
 
 
 
 
 
 
b5b9453

import spaces
import gradio as gr

from agents import build_graph

# Build the graph once at import time; process_and_display reuses this
# module-level instance for every request instead of rebuilding it.
graph = build_graph()


@spaces.GPU(duration=60)
def process_and_display(image, voices):
    """Generate a caption and per-voice descriptions for an uploaded image.

    Args:
        image: The value supplied by the image input, or ``None`` when the
            user has not uploaded anything.
        voices: The voice personas selected in the dropdown.

    Returns:
        A ``(caption, description)`` tuple of strings. ``description`` is the
        individual voice descriptions joined by a Markdown horizontal rule.
        When the voices or the image are missing, a pair of user-facing
        messages is returned instead and the graph is not invoked.
    """
    if not voices:  # Nothing selected in the dropdown
        return "Please select at least one voice persona.", "No voice personas selected."

    # The image component yields None when nothing was uploaded; bail out
    # before handing an unusable state to the graph.
    if image is None:
        return "Please upload an image.", "No image provided."

    # Initial state handed to the graph; caption/descriptions start empty.
    state = {"image": image, "voices": voices, "caption": "", "descriptions": []}

    # NOTE(review): max_concurrency=1 presumably limits the graph to one
    # step at a time to fit the GPU allocation -- confirm against agents.
    result = graph.invoke(state, {"max_concurrency": 1})

    descriptions: list[str] = result["descriptions"]

    # Separate each voice's text with a horizontal rule for Markdown display.
    return result["caption"], "\n\n---\n\n".join(descriptions)


def create_interface():
    """Assemble the Gradio Blocks UI and return it without launching.

    The layout is a title and intro text followed by one row with two
    columns: inputs (image, persona dropdown, submit button) on the left and
    outputs (caption textbox, descriptions Markdown) on the right. The submit
    button is wired to ``process_and_display``.

    Returns:
        The constructed ``gr.Blocks`` instance.
    """
    intro_text = """
        This app takes an image and generates descriptions using selected voice personas.
        
        1. Upload an image
        2. Select voice personas from the multi-select dropdown
        3. Click "Generate Description" to see the results
        
        The descriptions will be generated in parallel for all selected voices.
        """
    persona_choices = [
        "scurvy-ridden pirate",
        "private investigator",
        "sarcastic teenager",
        "forgetful wizard",
        "shakespearian",
    ]
    # Personas pre-selected when the page first loads.
    default_personas = ["scurvy-ridden pirate", "private investigator"]

    with gr.Blocks() as demo:
        gr.Markdown("# Image Description with Voice Personas")
        gr.Markdown(intro_text)

        with gr.Row():
            # Left column: everything the user provides.
            with gr.Column():
                uploaded_image = gr.Image(type="pil", label="Upload an Image")
                persona_selector = gr.Dropdown(
                    choices=persona_choices,
                    label="Select Voice Personas (max 2 recommended)",
                    multiselect=True,
                    value=default_personas,
                )
                generate_button = gr.Button("Generate Description")

            # Right column: results of a run.
            with gr.Column():
                caption_box = gr.Textbox(label="Image Caption", lines=4)
                descriptions_view = gr.Markdown(label="Voice Descriptions")

        # Inputs and outputs are matched positionally to the handler's
        # parameters and its returned tuple.
        generate_button.click(
            fn=process_and_display,
            inputs=[uploaded_image, persona_selector],
            outputs=[caption_box, descriptions_view],
        )

    return demo


# Build the interface at import time so `demo` exists as a module-level name.
demo = create_interface()

# Launch the app only when this file is executed directly, not on import.
if __name__ == "__main__":
    demo.launch()